David Hildenbrand <david@xxxxxxxxxx> writes: > On 07.11.24 18:32, Asahi Lina wrote: >> On 11/8/24 2:14 AM, David Hildenbrand wrote: >>> On 07.11.24 17:43, Asahi Lina wrote: >>>> On 11/5/24 9:03 PM, David Hildenbrand wrote: >>>>> On 01.11.24 13:08, Asahi Lina wrote: >>>>>> If the source page is a PFN mapping, we copy back from userspace. >>>>>> However, if this fault is a remote access, we cannot use >>>>>> __copy_from_user_inatomic. Instead, use access_remote_vm() in this >>>>>> case. >>>>>> >>>>>> Fixes WARN and incorrect zero-filling when writing to CoW mappings in >>>>>> a remote process, such as when using gdb on a binary present on a DAX >>>>>> filesystem. >>>>>> >>>>>> [ 143.683782] ------------[ cut here ]------------ >>>>>> [ 143.683784] WARNING: CPU: 1 PID: 350 at mm/memory.c:2904 >>>>>> __wp_page_copy_user+0x120/0x2bc >>>>>> [ 143.683793] CPU: 1 PID: 350 Comm: gdb Not tainted 6.6.52 #1 >>>>>> [ 143.683794] Hardware name: linux,dummy-virt (DT) >>>>>> [ 143.683795] pstate: 61400005 (nZCv daif +PAN -UAO -TCO +DIT -SSBS >>>>>> BTYPE=--) >>>>>> [ 143.683796] pc : __wp_page_copy_user+0x120/0x2bc >>>>>> [ 143.683798] lr : __wp_page_copy_user+0x254/0x2bc >>>>>> [ 143.683799] sp : ffff80008272b8b0 >>>>>> [ 143.683799] x29: ffff80008272b8b0 x28: 0000000000000000 x27: >>>>>> ffff000083bad580 >>>>>> [ 143.683801] x26: 0000000000000000 x25: 0000fffff7fd5000 x24: >>>>>> ffff000081db04c0 >>>>>> [ 143.683802] x23: ffff00014f24b000 x22: fffffc00053c92c0 x21: >>>>>> ffff000083502150 >>>>>> [ 143.683803] x20: 0000fffff7fd5000 x19: ffff80008272b9d0 x18: >>>>>> 0000000000000000 >>>>>> [ 143.683804] x17: ffff000081db0500 x16: ffff800080fe52a0 x15: >>>>>> 0000fffff7fd5000 >>>>>> [ 143.683804] x14: 0000000000bb1845 x13: 0000000000000080 x12: >>>>>> ffff80008272b880 >>>>>> [ 143.683805] x11: ffff000081d13600 x10: ffff000081d13608 x9 : >>>>>> ffff000081d1360c >>>>>> [ 143.683806] x8 : ffff000083a16f00 x7 : 0000000000000010 x6 : >>>>>> ffff00014f24b000 >>>>>> [ 143.683807] x5 : 
ffff00014f24c000 x4 : 0000000000000000 x3 : >>>>>> ffff000083582000 >>>>>> [ 143.683807] x2 : 0000000000000f80 x1 : 0000fffff7fd5000 x0 : >>>>>> 0000000000001000 >>>>>> [ 143.683808] Call trace: >>>>>> [ 143.683809] __wp_page_copy_user+0x120/0x2bc >>>>>> [ 143.683810] wp_page_copy+0x98/0x5c0 >>>>>> [ 143.683813] do_wp_page+0x250/0x530 >>>>>> [ 143.683814] __handle_mm_fault+0x278/0x284 >>>>>> [ 143.683817] handle_mm_fault+0x64/0x1e8 >>>>>> [ 143.683819] faultin_page+0x5c/0x110 >>>>>> [ 143.683820] __get_user_pages+0xc8/0x2f4 >>>>>> [ 143.683821] get_user_pages_remote+0xac/0x30c >>>>>> [ 143.683823] __access_remote_vm+0xb4/0x368 >>>>>> [ 143.683824] access_remote_vm+0x10/0x1c >>>>>> [ 143.683826] mem_rw.isra.0+0xc4/0x218 >>>>>> [ 143.683831] mem_write+0x18/0x24 >>>>>> [ 143.683831] vfs_write+0xa0/0x37c >>>>>> [ 143.683834] ksys_pwrite64+0x7c/0xc0 >>>>>> [ 143.683834] __arm64_sys_pwrite64+0x20/0x2c >>>>>> [ 143.683835] invoke_syscall+0x48/0x10c >>>>>> [ 143.683837] el0_svc_common.constprop.0+0x40/0xe0 >>>>>> [ 143.683839] do_el0_svc+0x1c/0x28 >>>>>> [ 143.683841] el0_svc+0x3c/0xdc >>>>>> [ 143.683846] el0t_64_sync_handler+0x120/0x12c >>>>>> [ 143.683848] el0t_64_sync+0x194/0x198 >>>>>> [ 143.683849] ---[ end trace 0000000000000000 ]--- >>>>>> >>>>>> Signed-off-by: Asahi Lina <lina@xxxxxxxxxxxxx> >>>>>> --- >>>>>> mm/memory.c | 7 ++++++- >>>>>> 1 file changed, 6 insertions(+), 1 deletion(-) >>>>>> >>>>>> diff --git a/mm/memory.c b/mm/memory.c >>>>>> index >>>>>> 3ccee51adfbbd007b24331fe6874265f231a877b..dba25d9734063ac02cdaeb0a5cd5432473f6372e 100644 >>>>>> --- a/mm/memory.c >>>>>> +++ b/mm/memory.c >>>>>> @@ -3081,13 +3081,18 @@ static inline int __wp_page_copy_user(struct >>>>>> page *dst, struct page *src, >>>>>> update_mmu_cache_range(vmf, vma, addr, vmf->pte, 1); >>>>>> } >>>>>> + /* If the mm is a remote mm, copy in the page using >>>>>> access_remote_vm() */ >>>>>> + if (current->mm != mm) { >>>>>> + if (access_remote_vm(mm, (unsigned long)uaddr, kaddr, >>>>>> 
PAGE_SIZE, 0) != PAGE_SIZE) >>>>> >>>>> access_remote_vm() will do a mmap_read_lock_killable() and then call >>>>> into get_user_page_vma_remote() -- fortunately read-access, otherwise >>>>> we'd be in trouble :) . >>>>> >>>>> So we should already be holding the mmap read lock from the previous >>>>> access_remote_vm() users (who we end up here) ... doesn't this complain >>>>> with lockdep about recursive locking? >>>>> >>>>> I keep forgetting locking rules, so I might just be wrong. >>>> >>>> You're right, this complains with lockdep: >>>> >>>> [ 23.154031] >>>> [ 23.154093] ============================================ >>>> [ 23.154193] WARNING: possible recursive locking detected >>>> [ 23.154229] 6.6.52 #2 Not tainted >>>> [ 23.154270] -------------------------------------------- >>>> [ 23.154306] gdb/349 is trying to acquire lock: >>>> [ 23.154343] ffff0000862e3450 (&mm->mmap_lock){++++}-{3:3}, at: >>>> __access_remote_vm+0x3c/0x3a8 >>>> [ 23.154431] >>>> [ 23.154431] but task is already holding lock: >>>> [ 23.154474] ffff0000862e3450 (&mm->mmap_lock){++++}-{3:3}, at: >>>> __access_remote_vm+0x3c/0x3a8 >>>> [ 23.154553] >>>> [ 23.154553] other info that might help us debug this: >>>> [ 23.154598] Possible unsafe locking scenario: >>>> [ 23.154598] >>>> [ 23.154641] CPU0 >>>> [ 23.154665] ---- >>>> [ 23.154685] lock(&mm->mmap_lock); >>>> [ 23.154712] lock(&mm->mmap_lock); >>>> [ 23.154741] >>>> [ 23.154741] *** DEADLOCK *** >>>> [ 23.154741] >>>> [ 23.154790] May be due to missing lock nesting notation >>>> [ 23.154790] >>>> [ 23.154838] 2 locks held by gdb/349: >>>> [ 23.154868] #0: ffff0000835b53f8 (sb_writers#4){.+.+}-{0:0}, at: >>>> vfs_write+0x84/0x2e0 >>>> [ 23.154945] #1: ffff0000862e3450 (&mm->mmap_lock){++++}-{3:3}, at: >>>> __access_remote_vm+0x3c/0x3a8 >>>> [ 23.155023] >>>> [ 23.155023] stack backtrace: >>>> [ 23.155060] CPU: 5 PID: 349 Comm: gdb Not tainted 6.6.52 #2 >>>> [ 23.155112] Hardware name: linux,dummy-virt (DT) >>>> [ 23.155148] Call 
trace: >>>> [ 23.155167] dump_backtrace+0x98/0x118 >>>> [ 23.155209] show_stack+0x18/0x24 >>>> [ 23.155240] dump_stack_lvl+0x60/0xac >>>> [ 23.155292] dump_stack+0x18/0x24 >>>> [ 23.155320] print_deadlock_bug+0x260/0x34c >>>> [ 23.155364] validate_chain+0x364/0x4c0 >>>> [ 23.155393] __lock_acquire+0x564/0xb64 >>>> [ 23.155420] lock_acquire.part.0+0x9c/0x1bc >>>> [ 23.155448] lock_acquire+0x9c/0x140 >>>> [ 23.155477] down_read_killable+0x44/0x158 >>>> [ 23.155521] __access_remote_vm+0x3c/0x3a8 >>>> [ 23.155562] __wp_page_copy_user+0x13c/0x3a8 >>>> [ 23.155611] wp_page_copy+0x98/0x4d8 >>>> [ 23.155640] do_wp_page+0x290/0x594 >>>> [ 23.155671] __handle_mm_fault+0x258/0x25c >>>> [ 23.155712] handle_mm_fault+0x64/0x1f0 >>>> [ 23.155755] faultin_page+0x64/0x138 >>>> [ 23.155798] __get_user_pages+0x11c/0x340 >>>> [ 23.155843] get_user_pages_remote+0xc4/0x404 >>>> [ 23.155895] __access_remote_vm+0xf4/0x3a8 >>>> [ 23.155922] access_remote_vm+0x10/0x1c >>>> [ 23.155952] mem_rw.isra.0+0xc4/0x218 >>>> [ 23.155996] mem_write+0x18/0x24 >>>> [ 23.156023] vfs_write+0xa4/0x2e0 >>>> [ 23.156066] ksys_pwrite64+0x7c/0xc0 >>>> [ 23.156109] __arm64_sys_pwrite64+0x20/0x2c >>>> [ 23.156152] invoke_syscall+0x48/0x10c >>>> [ 23.156196] el0_svc_common.constprop.0+0x40/0xe0 >>>> [ 23.156249] do_el0_svc+0x1c/0x28 >>>> [ 23.156293] el0_svc+0x54/0x140 >>>> [ 23.156334] el0t_64_sync_handler+0x120/0x12c >>>> [ 23.156384] el0t_64_sync+0x194/0x198 >>>> >>>> I guess the locking implementation is recursive so that's why this >>>> didn't actually deadlock... >>>> >>>> I'm not sure what the right way to do this is then. The underlying >>>> reason why the fallback code is being called is that do_wp_page() calls >>>> vm_normal_page(), which returns NULL for VM_PFNMAP pages. So vmf->page >>>> is NULL and __wp_page_copy_user has to use the fallback path. 
However, >>>> the reason GUP works is that follow_page_pte() and friends have a >>>> specific fallback path for the pte_devmap() case that grabs a struct >>>> page anyway. Maybe similar logic should be in do_wp_page() so it can >>>> grab a struct page for PFN mappings too? >>> >>> There is currently WIP to remove pte_devmap() and make vm_normal_page() >>> return these pages as well. >>> >>> But that would not be in VM_PFNMAP mappings, because VM_PFNMAP means >>> "don't you ever look at the struct page". >>> >>> Likely, you do not have a VM_PFNMAP mapping here but instead a >>> VM_MIXEDMAP mapping (or likely no special mapping at all)? >>> >>> vm_normal_page() returns NULL for pte_devmap(), independent of >>> VM_PFNMAP, because pte_special() should succeed on them. >>> >>> >>> >>> I recall that there is still a problem with false-positives on >>> folio_test_anon() with ZONE_DEVICE pages, so it's maybe not that >>> easy ... and the whole get_dev_pagemap() stuff is nasty. Specifically FS DAX reuses PAGE_MAPPING_ANON in include/linux/page-flags.h /* * Different with flags above, this flag is used only for fsdax mode. It * indicates that this page->mapping is now under reflink case. */ #define PAGE_MAPPING_DAX_SHARED ((void *)0x1) FS DAX pages are never anon though, so you could probably test for !vma_is_dax() and/or add an implementation of is_fsdax_page(). >>> Likely we would have to do what GUP does, and temporarily grab a pgmap >>> reference. Gah. >>> >>> >>> So if we sort out the pagemap stuff and the possibly wrong >>> folio_test_anon() on some ZONE_DEVICE pages (but not all, because IIRC >>> DEVICE_PRIVATE can be anon ...), it might be doable. Correct, DEVICE_PRIVATE and DEVICE_COHERENT pages are always anon (at least for now). >>> But it sounds ugly, especially because that code might change soon and >>> not require messing with ZONE_DEVICE pages on that level. Yes, I'm hoping to get the next version of that series posted this week.
I found a couple of other FS DAX bugs that slowed me down. - Alistair >>> And then, we'd not be able to handle VM_PFNMAP cleanly ... >>> >>> >>> Maybe we could test if the PFN has a directmap and simply read using >>> that? I mean, that's what kmap_local_page() ends up doing on systems >>> without highmem ... and without !defined(HASHED_PAGE_VIRTUAL) && ! >>> defined(WANT_PAGE_VIRTUAL) the kmap_local_page() really just is a >>> page_to_virt(), which is mostly mapping a PFN to the corresponding >>> virtual address ... >>> >>> But it doesn't universally work ... >>> >>>> >>>> Or if the problem is just the lock, would just eliding the locking work? >>>> I guess that only works if all the calls into wp_page_copy() are >>>> guaranteed to hold the mmap lock already, but I don't know if that is >>>> true... >>> >>> The whole "GUP recursively calling into GUP" code looks concerning. >>> Could we even trigger a case where we get a recursive page fault handler >>> call, because of some odd race? (concurrent MADV_DONTNEED or similar) >>> >>> I think we should much rather fail this remote fault if there is no easy >>> way to make it work right now. >>> >>> At least I suspect this is primarily a "debugger" scenario that didn't >>> work so far and we could leave it "not working because not supported" in >>> a nicer way? >>> >>> >>> If this really must succeed, I can spend some time thinking about how to >>> do this cleaner ... >> Well, this breaks debuggers in general on a virtiofs VM mounted with >> DAX, which is a sensible use case I think. One reason to use DAX is >> avoiding duplication of the page cache between the host and the guest >> (or multiple guests). >> I think the main reason not that many people are trying DAX across >> the >> board for virtiofs is various bugs that have been slowly fixed, and this >> would be one of the remaining ones... 
>> (Full disclosure: For the use case I'm working on we're no longer >> mounting the whole rootfs with DAX right now (only a subset) since we're >> still evaluating the performance, but I'd like to keep the option open >> and having it break debuggers is kind of a blocker...) > > Thanks for the information. So it never worked and we primarily care about > virtio-fs DAX support, not some VM_PFNMAP mappings or other DAX mappings. > > > We should first fix the warning using something like: > > From 1ca7e9cf8067112ccaeb3c67230093d3aef8f2a3 Mon Sep 17 00:00:00 2001 > From: David Hildenbrand <david@xxxxxxxxxx> > Date: Fri, 8 Nov 2024 10:34:01 +0100 > Subject: [PATCH] mm/memory: silence warning in __wp_page_copy_user() on remote > faults without a src page > > Signed-off-by: David Hildenbrand <david@xxxxxxxxxx> > --- > mm/memory.c | 36 ++++++++++++++++-------------------- > 1 file changed, 16 insertions(+), 20 deletions(-) > > diff --git a/mm/memory.c b/mm/memory.c > index 209885a4134f7..720b20f71ba61 100644 > --- a/mm/memory.c > +++ b/mm/memory.c > @@ -3038,28 +3038,29 @@ static inline int pte_unmap_same(struct vm_fault *vmf) > return same; > } > -/* > - * Return: > - * 0: copied succeeded > - * -EHWPOISON: copy failed due to hwpoison in source page > - * -EAGAIN: copied failed (some other reason) > - */ > -static inline int __wp_page_copy_user(struct page *dst, struct page *src, > - struct vm_fault *vmf) > +static inline vm_fault_t __wp_page_copy_user(struct page *dst, struct page *src, > + struct vm_fault *vmf) > { > - int ret; > void *kaddr; > void __user *uaddr; > struct vm_area_struct *vma = vmf->vma; > struct mm_struct *mm = vma->vm_mm; > unsigned long addr = vmf->address; > + vm_fault_t ret = 0; > if (likely(src)) { > if (copy_mc_user_highpage(dst, src, addr, vma)) > - return -EHWPOISON; > + return VM_FAULT_HWPOISON; > return 0; > } > + /* > + * We cannot copy from user, so remote faults without a page are not > + * supported for now. 
> + */ > + if (mm != current->mm) > + return VM_FAULT_SIGSEGV; > + > /* > * If the source page was a PFN mapping, we don't have > * a "struct page" for it. We do a best-effort copy by > @@ -3086,7 +3087,7 @@ static inline int __wp_page_copy_user(struct page *dst, struct page *src, > */ > if (vmf->pte) > update_mmu_tlb(vma, addr, vmf->pte); > - ret = -EAGAIN; > + ret = VM_FAULT_RETRY; > goto pte_unlock; > } > @@ -3111,7 +3112,7 @@ static inline int __wp_page_copy_user(struct > page *dst, struct page *src, > /* The PTE changed under us, update local tlb */ > if (vmf->pte) > update_mmu_tlb(vma, addr, vmf->pte); > - ret = -EAGAIN; > + ret = VM_FAULT_RETRY; > goto pte_unlock; > } > @@ -3130,8 +3131,6 @@ static inline int __wp_page_copy_user(struct > page *dst, struct page *src, > } > } > - ret = 0; > - > pte_unlock: > if (vmf->pte) > pte_unmap_unlock(vmf->pte, vmf->ptl); > @@ -3369,23 +3368,20 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) > goto oom; > if (!pfn_is_zero) { > - int err; > - > - err = __wp_page_copy_user(&new_folio->page, vmf->page, vmf); > - if (err) { > + ret = __wp_page_copy_user(&new_folio->page, vmf->page, vmf); > + if (unlikely(ret)) { > /* > * COW failed, if the fault was solved by other, > * it's fine. If not, userspace would re-fault on > * the same address and we will handle the fault > * from the second attempt. > - * The -EHWPOISON case will not be retried. > */ > folio_put(new_folio); > if (old_folio) > folio_put(old_folio); > delayacct_wpcopy_end(); > - return err == -EHWPOISON ? VM_FAULT_HWPOISON : 0; > + return ret == VM_FAULT_RETRY ? 0 : ret; > } > kmsan_copy_page_meta(&new_folio->page, vmf->page); > } > -- > 2.47.0 > > > > For MEMORY_DEVICE_FS_DAX, we should probably wait for [1], CCing Alistair and Dan. 
> > As discussed, maybe the following would work (as long as we don't get any > folio_test_anon() false-positives on these MEMORY_DEVICE_FS_DAX), but I'm > not quite happy about leaking these legacy MEMORY_DEVICE_FS_DAX into the > core code, especially as it might soon no longer be necessary. > > > From e84309bfa4772485b2340712d7b53a8a7ba1b0fc Mon Sep 17 00:00:00 2001 > From: David Hildenbrand <david@xxxxxxxxxx> > Date: Fri, 8 Nov 2024 10:50:42 +0100 > Subject: [PATCH] mm/memory: support legacy MEMORY_DEVICE_FS_DAX in > do_wp_page() > > Signed-off-by: David Hildenbrand <david@xxxxxxxxxx> > --- > mm/memory.c | 28 +++++++++++++++++++++------- > 1 file changed, 21 insertions(+), 7 deletions(-) > > diff --git a/mm/memory.c b/mm/memory.c > index 720b20f71ba61..b3830aba08c53 100644 > --- a/mm/memory.c > +++ b/mm/memory.c > @@ -3667,7 +3667,9 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf) > { > const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE; > struct vm_area_struct *vma = vmf->vma; > + struct dev_pagemap *pgmap = NULL; > struct folio *folio = NULL; > + vm_fault_t ret = 0; > pte_t pte; > if (likely(!unshare)) { > @@ -3702,9 +3704,15 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf) > } > vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte); > - > - if (vmf->page) > + if (likely(vmf->page)) { > folio = page_folio(vmf->page); > + } else if (pte_devmap(vmf->orig_pte)) { > + pgmap = get_dev_pagemap(pte_pfn(pte), NULL); > + if (pgmap) { > + vmf->page = pte_page(pte); > + folio = page_folio(vmf->page); > + } > + } > /* > * Shared mapping: we are guaranteed to have VM_WRITE and > @@ -3719,8 +3727,10 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf) > * Just mark the pages writable and/or call ops->pfn_mkwrite. 
> */ > if (!vmf->page) > - return wp_pfn_shared(vmf); > - return wp_page_shared(vmf, folio); > + ret = wp_pfn_shared(vmf); > + else > + ret = wp_page_shared(vmf, folio); > + goto out_pgmap; > } > /* > @@ -3736,10 +3746,10 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf) > SetPageAnonExclusive(vmf->page); > if (unlikely(unshare)) { > pte_unmap_unlock(vmf->pte, vmf->ptl); > - return 0; > + goto out_pgmap; > } > wp_page_reuse(vmf, folio); > - return 0; > + goto out_pgmap; > } > /* > * Ok, we need to copy. Oh, well.. > @@ -3752,7 +3762,11 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf) > if (folio && folio_test_ksm(folio)) > count_vm_event(COW_KSM); > #endif > - return wp_page_copy(vmf); > + ret = wp_page_copy(vmf); > +out_pgmap: > + if (unlikely(pgmap)) > + put_dev_pagemap(pgmap); > + return ret; > } > static void unmap_mapping_range_vma(struct vm_area_struct *vma, > -- > 2.47.0 > > > > [1] https://lkml.kernel.org/r/cover.9f0e45d52f5cff58807831b6b867084d0b14b61c.1725941415.git-series.apopple@xxxxxxxxxx