On Fri, Aug 02, 2024 at 10:47:15PM -0700, Andrii Nakryiko wrote:

> Is there any reason why the approach below won't work?
>
> diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
> index 8be9e34e786a..e21b68a39f13 100644
> --- a/kernel/events/uprobes.c
> +++ b/kernel/events/uprobes.c
> @@ -2251,6 +2251,52 @@ static struct uprobe *find_active_uprobe_rcu(unsigned long bp_vaddr, int *is_swb
>  	struct uprobe *uprobe = NULL;
>  	struct vm_area_struct *vma;
>
> +#ifdef CONFIG_PER_VMA_LOCK
> +	vm_flags_t flags = VM_HUGETLB | VM_MAYEXEC | VM_MAYSHARE, vm_flags;
> +	struct file *vm_file;
> +	struct inode *vm_inode;
> +	unsigned long vm_pgoff, vm_start, vm_end;
> +	int vm_lock_seq;
> +	loff_t offset;
> +
> +	rcu_read_lock();
> +
> +	vma = vma_lookup(mm, bp_vaddr);
> +	if (!vma)
> +		goto retry_with_lock;
> +
> +	vm_lock_seq = READ_ONCE(vma->vm_lock_seq);

So vma->vm_lock_seq is only updated on vma_start_write().

> +	vm_file = READ_ONCE(vma->vm_file);
> +	vm_flags = READ_ONCE(vma->vm_flags);
> +	if (!vm_file || (vm_flags & flags) != VM_MAYEXEC)
> +		goto retry_with_lock;
> +
> +	vm_inode = READ_ONCE(vm_file->f_inode);
> +	vm_pgoff = READ_ONCE(vma->vm_pgoff);
> +	vm_start = READ_ONCE(vma->vm_start);
> +	vm_end = READ_ONCE(vma->vm_end);

None of those are written with WRITE_ONCE(), so this buys you nothing. The
compiler could be updating them one byte at a time while you load some
franken-update.

Also, if you're in the middle of split_vma() you might not get a consistent
set.

> +	if (bp_vaddr < vm_start || bp_vaddr >= vm_end)
> +		goto retry_with_lock;
> +
> +	offset = (loff_t)(vm_pgoff << PAGE_SHIFT) + (bp_vaddr - vm_start);
> +	uprobe = find_uprobe_rcu(vm_inode, offset);
> +	if (!uprobe)
> +		goto retry_with_lock;
> +
> +	/* now double check that nothing about VMA changed */
> +	if (vm_lock_seq != READ_ONCE(vma->vm_lock_seq))
> +		goto retry_with_lock;

Since vma->vm_lock_seq is only ever updated at vma_start_write(), you're
checking that you're in or after the same modification cycle.

The point of sequence locks is to check whether you're *IN* a modification
cycle and retry if you are. You're now explicitly continuing if you're in a
modification.

You really need:

	seq++;
	wmb();

	... do modification

	wmb();
	seq++;

vs

	do {
		s = READ_ONCE(seq) & ~1;
		rmb();

		... read stuff

	} while (rmb(), seq != s);

The thing to note is that seq will be odd while inside a modification and
even outside. Further, if the pre and post seq are both even but not
identical, you've crossed a modification and also need to retry.

> +
> +	/* happy case, we speculated successfully */
> +	rcu_read_unlock();
> +	return uprobe;
> +
> +retry_with_lock:
> +	rcu_read_unlock();
> +	uprobe = NULL;
> +#endif
> +
>  	mmap_read_lock(mm);
>  	vma = vma_lookup(mm, bp_vaddr);
>  	if (vma) {
>
> diff --git a/kernel/fork.c b/kernel/fork.c
> index cc760491f201..211a84ee92b4 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -3160,7 +3160,7 @@ void __init proc_caches_init(void)
>  			NULL);
>  	files_cachep = kmem_cache_create("files_cache",
>  			sizeof(struct files_struct), 0,
> -			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
> +			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT|SLAB_TYPESAFE_BY_RCU,
>  			NULL);
>  	fs_cachep = kmem_cache_create("fs_cache",
>  			sizeof(struct fs_struct), 0,
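
To spell out the tearing point: READ_ONCE() only constrains the reading
side's compiler; if the writer does plain stores, those stores can still be
torn. And even a properly paired WRITE_ONCE()/READ_ONCE() only makes each
individual, aligned, word-sized access untorn; it does nothing to make a
group of fields consistent with one another. A minimal illustrative sketch,
using a made-up struct rather than the real vm_area_struct:

	/* Illustrative stand-in for the sampled vma fields, not kernel code. */
	struct range {
		unsigned long start;
		unsigned long end;
	};

	static void range_set(struct range *r, unsigned long start, unsigned long end)
	{
		/* paired with READ_ONCE() below: no store tearing per field */
		WRITE_ONCE(r->start, start);
		WRITE_ONCE(r->end, end);
	}

	static bool range_contains(struct range *r, unsigned long addr)
	{
		unsigned long start = READ_ONCE(r->start);
		unsigned long end = READ_ONCE(r->end);

		/*
		 * Each load above is untorn, but the (start, end) pair can
		 * still mix an old start with a new end when racing with
		 * range_set(); detecting that needs a sequence count, not
		 * more READ_ONCE().
		 */
		return addr >= start && addr < end;
	}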
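
For completeness, the odd/even protocol described above is what the generic
seqcount API in <linux/seqlock.h> implements, barriers included. Below is a
minimal sketch of a writer and a lockless reader built on it; struct
vma_snap and the helpers are illustrative names rather than anything from
the patch, seqcount_init() at setup is omitted, and the writer is assumed to
already be serialized (e.g. by the mmap/vma write lock):

	#include <linux/seqlock.h>

	/* Illustrative stand-in for the fields the speculative path samples. */
	struct vma_snap {
		seqcount_t	seq;
		unsigned long	start, end;
	};

	/* Writer: seq is odd for the whole modification window. */
	static void snap_update(struct vma_snap *s, unsigned long start,
				unsigned long end)
	{
		write_seqcount_begin(&s->seq);	/* seq++; smp_wmb() */
		s->start = start;
		s->end = end;
		write_seqcount_end(&s->seq);	/* smp_wmb(); seq++ */
	}

	/* Reader: retries if it started inside, or crossed, a modification. */
	static void snap_read(struct vma_snap *s, unsigned long *start,
			      unsigned long *end)
	{
		unsigned int seq;

		do {
			seq = read_seqcount_begin(&s->seq);	/* waits out an odd seq */
			*start = s->start;
			*end = s->end;
		} while (read_seqcount_retry(&s->seq, seq));	/* smp_rmb() + recheck */
	}

That retry loop is what the single vm_lock_seq recheck in the patch cannot
replicate: vm_lock_seq only changes at vma_start_write(), so seeing the same
value before and after does not tell you whether you read inside, or
straddled, a modification.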