David Hildenbrand <david@xxxxxxxxxx> writes:

> [...]
>
> Thanks for giving my suggestions a churn. I think we can further
> improve/simplify some things. I added some comments, but might have more
> regarding MMF_VM_MERGE_ANY / MMF_VM_MERGEABLE.
>
> [I'll try reworking your patch after I send this mail to play with some
> simplifications]
>
>>  arch/s390/mm/gmap.c            |   1 +
>>  include/linux/ksm.h            |  23 +++++--
>>  include/linux/sched/coredump.h |   1 +
>>  include/uapi/linux/prctl.h     |   2 +
>>  kernel/fork.c                  |   1 +
>>  kernel/sys.c                   |  23 +++++++
>>  mm/ksm.c                       | 111 ++++++++++++++++++++++++++-------
>>  mm/mmap.c                      |   7 +++
>>  8 files changed, 142 insertions(+), 27 deletions(-)
>>
>> diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
>> index 5a716bdcba05..9d85e5589474 100644
>> --- a/arch/s390/mm/gmap.c
>> +++ b/arch/s390/mm/gmap.c
>> @@ -2591,6 +2591,7 @@ int gmap_mark_unmergeable(void)
>>  	int ret;
>>  	VMA_ITERATOR(vmi, mm, 0);
>>
>> +	clear_bit(MMF_VM_MERGE_ANY, &mm->flags);
>
> Okay, that should keep the existing mechanism working. (but users can still mess
> it up)
>
> Might be worth a comment
>
> /*
>  * Make sure to disable KSM (if enabled for the whole process or
>  * individual VMAs). Note that nothing currently hinders user space
>  * from re-enabling it.
>  */
>

I'll add the comment.

>>  	for_each_vma(vmi, vma) {
>>  		/* Copy vm_flags to avoid partial modifications in ksm_madvise */
>>  		vm_flags = vma->vm_flags;
>> diff --git a/include/linux/ksm.h b/include/linux/ksm.h
>> index 7e232ba59b86..f24f9faf1561 100644
>> --- a/include/linux/ksm.h
>> +++ b/include/linux/ksm.h
>> @@ -18,20 +18,29 @@
>>  #ifdef CONFIG_KSM
>>  int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
>>  		unsigned long end, int advice, unsigned long *vm_flags);
>> -int __ksm_enter(struct mm_struct *mm);
>> -void __ksm_exit(struct mm_struct *mm);
>> +
>> +int ksm_add_mm(struct mm_struct *mm);
>> +void ksm_add_vma(struct vm_area_struct *vma);
>> +void ksm_add_vmas(struct mm_struct *mm);
>> +
>> +int __ksm_enter(struct mm_struct *mm, int flag);
>> +void __ksm_exit(struct mm_struct *mm, int flag);
>>
>>  static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
>>  {
>> +	if (test_bit(MMF_VM_MERGE_ANY, &oldmm->flags))
>> +		return ksm_add_mm(mm);
>
> ksm_fork() runs before copying any VMAs. Copying the bit should be sufficient.
>
> Would it be possible to rework to something like:
>
> if (test_bit(MMF_VM_MERGE_ANY, &oldmm->flags))
> 	set_bit(MMF_VM_MERGE_ANY, &mm->flags)
> if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags))
> 	return __ksm_enter(mm);
>

That will work.

> work? IOW, not exporting ksm_add_mm() and not passing a flag to __ksm_enter() --
> it would simply set MMF_VM_MERGEABLE ?
>

ksm_add_mm() is also used in prctl (kernel/sys.c). Do you want to make a similar
change there?

> I remember proposing that enabling MMF_VM_MERGE_ANY would simply enable
> MMF_VM_MERGEABLE.
>
>>  	if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags))
>> -		return __ksm_enter(mm);
>> +		return __ksm_enter(mm, MMF_VM_MERGEABLE);
>>  	return 0;
>>  }
>>
>>  static inline void ksm_exit(struct mm_struct *mm)
>>  {
>> -	if (test_bit(MMF_VM_MERGEABLE, &mm->flags))
>> -		__ksm_exit(mm);
>> +	if (test_bit(MMF_VM_MERGE_ANY, &mm->flags))
>> +		__ksm_exit(mm, MMF_VM_MERGE_ANY);
>> +	else if (test_bit(MMF_VM_MERGEABLE, &mm->flags))
>> +		__ksm_exit(mm, MMF_VM_MERGEABLE);
>
> Can we do
>
> if (test_bit(MMF_VM_MERGEABLE, &mm->flags))
> 	__ksm_exit(mm);
>
> And simply let __ksm_exit() clear both bits?
>

Yes, I'll make the change.
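To make sure we mean the same thing, this is roughly how I picture the ksm.h
inlines with both of your suggestions applied (untested sketch; it assumes that
PR_SET_MEMORY_MERGE always goes through __ksm_enter(), so MMF_VM_MERGEABLE is
set whenever MMF_VM_MERGE_ANY is):

static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
{
	/* ksm_fork() runs before the VMAs are copied; only propagate the mm flag. */
	if (test_bit(MMF_VM_MERGE_ANY, &oldmm->flags))
		set_bit(MMF_VM_MERGE_ANY, &mm->flags);

	if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags))
		return __ksm_enter(mm);
	return 0;
}

static inline void ksm_exit(struct mm_struct *mm)
{
	/* __ksm_exit() would clear both MMF_VM_MERGEABLE and MMF_VM_MERGE_ANY. */
	if (test_bit(MMF_VM_MERGEABLE, &mm->flags))
		__ksm_exit(mm);
}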
>>  }
>>
>>  /*
>> @@ -53,6 +62,10 @@ void folio_migrate_ksm(struct folio *newfolio, struct folio *folio);
>>  #else  /* !CONFIG_KSM */
>>
>
> [...]
>
>>  #endif /* _LINUX_SCHED_COREDUMP_H */
>> diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
>> index 1312a137f7fb..759b3f53e53f 100644
>> --- a/include/uapi/linux/prctl.h
>> +++ b/include/uapi/linux/prctl.h
>> @@ -290,4 +290,6 @@ struct prctl_mm_map {
>>  #define PR_SET_VMA		0x53564d41
>>  # define PR_SET_VMA_ANON_NAME		0
>>
>> +#define PR_SET_MEMORY_MERGE		67
>> +#define PR_GET_MEMORY_MERGE		68
>>  #endif /* _LINUX_PRCTL_H */
>> diff --git a/kernel/fork.c b/kernel/fork.c
>> index f68954d05e89..1520697cf6c7 100644
>> --- a/kernel/fork.c
>> +++ b/kernel/fork.c
>> @@ -686,6 +686,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
>>  		if (vma_iter_bulk_store(&vmi, tmp))
>>  			goto fail_nomem_vmi_store;
>>
>> +		ksm_add_vma(tmp);
>
> Is this really required? The relevant VMAs should have VM_MERGEABLE set.
>

I'll fix it.

>>  		mm->map_count++;
>>  		if (!(tmp->vm_flags & VM_WIPEONFORK))
>>  			retval = copy_page_range(tmp, mpnt);
>> diff --git a/kernel/sys.c b/kernel/sys.c
>> index 495cd87d9bf4..9bba163d2d04 100644
>> --- a/kernel/sys.c
>> +++ b/kernel/sys.c
>> @@ -15,6 +15,7 @@
>>  #include <linux/highuid.h>
>>  #include <linux/fs.h>
>>  #include <linux/kmod.h>
>> +#include <linux/ksm.h>
>>  #include <linux/perf_event.h>
>>  #include <linux/resource.h>
>>  #include <linux/kernel.h>
>> @@ -2661,6 +2662,28 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
>>  	case PR_SET_VMA:
>>  		error = prctl_set_vma(arg2, arg3, arg4, arg5);
>>  		break;
>> +#ifdef CONFIG_KSM
>> +	case PR_SET_MEMORY_MERGE:
>> +		if (mmap_write_lock_killable(me->mm))
>> +			return -EINTR;
>> +
>> +		if (arg2) {
>> +			int err = ksm_add_mm(me->mm);
>> +
>> +			if (!err)
>> +				ksm_add_vmas(me->mm);
>> +		} else {
>> +			clear_bit(MMF_VM_MERGE_ANY, &me->mm->flags);
>
> Okay, so disabling doesn't actually unshare anything.
>
>> +		}
>> +		mmap_write_unlock(me->mm);
>> +		break;
>> +	case PR_GET_MEMORY_MERGE:
>> +		if (arg2 || arg3 || arg4 || arg5)
>> +			return -EINVAL;
>> +
>> +		error = !!test_bit(MMF_VM_MERGE_ANY, &me->mm->flags);
>> +		break;
>> +#endif
>>  	default:
>>  		error = -EINVAL;
>>  		break;
>> diff --git a/mm/ksm.c b/mm/ksm.c
>> index d7bd28199f6c..ab95ae0f9def 100644
>> --- a/mm/ksm.c
>> +++ b/mm/ksm.c
>> @@ -534,10 +534,33 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr,
>>  	return (ret & VM_FAULT_OOM) ? -ENOMEM : 0;
>>  }
>>
>> +static bool vma_ksm_compatible(struct vm_area_struct *vma)
>> +{
>> +	if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE | VM_PFNMAP |
>> +			     VM_IO | VM_DONTEXPAND | VM_HUGETLB |
>> +			     VM_MIXEDMAP))
>> +		return false;		/* just ignore the advice */
>> +
>> +	if (vma_is_dax(vma))
>> +		return false;
>> +
>> +#ifdef VM_SAO
>> +	if (vma->vm_flags & VM_SAO)
>> +		return false;
>> +#endif
>> +#ifdef VM_SPARC_ADI
>> +	if (vma->vm_flags & VM_SPARC_ADI)
>> +		return false;
>> +#endif
>> +
>> +	return true;
>> +}
>> +
>>  static struct vm_area_struct *find_mergeable_vma(struct mm_struct *mm,
>>  		unsigned long addr)
>>  {
>>  	struct vm_area_struct *vma;
>> +
>
> unrelated change
>

Removed.
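On the prctl interface itself: user space only needs the plain prctl() calls.
Here is a minimal sketch of a test program (the #defines simply mirror the
values this patch adds to the uapi header, for the case where the installed
headers do not have them yet):

#include <stdio.h>
#include <sys/prctl.h>

#ifndef PR_SET_MEMORY_MERGE
#define PR_SET_MEMORY_MERGE	67
#define PR_GET_MEMORY_MERGE	68
#endif

int main(void)
{
	/* Opt the whole process (existing and future VMAs) into KSM. */
	if (prctl(PR_SET_MEMORY_MERGE, 1, 0, 0, 0))
		perror("PR_SET_MEMORY_MERGE");

	/* PR_GET_MEMORY_MERGE requires arg3..arg5 to be 0 and returns 0 or 1. */
	printf("merge enabled: %d\n", prctl(PR_GET_MEMORY_MERGE, 0, 0, 0, 0));

	return 0;
}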
>>  	if (ksm_test_exit(mm))
>>  		return NULL;
>>  	vma = vma_lookup(mm, addr);
>> @@ -1065,6 +1088,7 @@ static int unmerge_and_remove_all_rmap_items(void)
>>  			mm_slot_free(mm_slot_cache, mm_slot);
>>  			clear_bit(MMF_VM_MERGEABLE, &mm->flags);
>> +			clear_bit(MMF_VM_MERGE_ANY, &mm->flags);
>>  			mmdrop(mm);
>>  		} else
>>  			spin_unlock(&ksm_mmlist_lock);
>> @@ -2495,6 +2519,7 @@ static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page)
>>  		mm_slot_free(mm_slot_cache, mm_slot);
>>  		clear_bit(MMF_VM_MERGEABLE, &mm->flags);
>> +		clear_bit(MMF_VM_MERGE_ANY, &mm->flags);
>>  		mmap_read_unlock(mm);
>>  		mmdrop(mm);
>>  	} else {
>> @@ -2571,6 +2596,63 @@ static int ksm_scan_thread(void *nothing)
>>  	return 0;
>>  }
>>
>> +static void __ksm_add_vma(struct vm_area_struct *vma)
>> +{
>> +	unsigned long vm_flags = vma->vm_flags;
>> +
>> +	if (vm_flags & VM_MERGEABLE)
>> +		return;
>> +
>> +	if (vma_ksm_compatible(vma)) {
>> +		vm_flags |= VM_MERGEABLE;
>> +		vm_flags_reset(vma, vm_flags);
>> +	}
>> +}
>> +
>> +/**
>> + * ksm_add_vma - Mark vma as mergeable
>
> "if compatible"
>

I'll add the above.

>> + *
>> + * @vma:  Pointer to vma
>> + */
>> +void ksm_add_vma(struct vm_area_struct *vma)
>> +{
>> +	struct mm_struct *mm = vma->vm_mm;
>> +
>> +	if (test_bit(MMF_VM_MERGE_ANY, &mm->flags))
>> +		__ksm_add_vma(vma);
>> +}
>> +
>> +/**
>> + * ksm_add_vmas - Mark all vma's of a process as mergeable
>> + *
>> + * @mm:  Pointer to mm
>> + */
>> +void ksm_add_vmas(struct mm_struct *mm)
>
> I'd suggest calling this
>

I guess you forgot your name suggestion?

>> +{
>> +	struct vm_area_struct *vma;
>> +
>> +	VMA_ITERATOR(vmi, mm, 0);
>> +	for_each_vma(vmi, vma)
>> +		__ksm_add_vma(vma);
>> +}
>> +
>> +/**
>> + * ksm_add_mm - Add mm to mm ksm list
>> + *
>> + * @mm:  Pointer to mm
>> + *
>> + * Returns 0 on success, otherwise error code
>> + */
>> +int ksm_add_mm(struct mm_struct *mm)
>> +{
>> +	if (test_bit(MMF_VM_MERGE_ANY, &mm->flags))
>> +		return -EINVAL;
>> +	if (test_bit(MMF_VM_MERGEABLE, &mm->flags))
>> +		return -EINVAL;
>> +
>> +	return __ksm_enter(mm, MMF_VM_MERGE_ANY);
>> +}
>> +
>>  int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
>>  		unsigned long end, int advice, unsigned long *vm_flags)
>>  {
>> @@ -2579,28 +2661,13 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
>>
>>  	switch (advice) {
>>  	case MADV_MERGEABLE:
>> -		/*
>> -		 * Be somewhat over-protective for now!
>> -		 */
>> -		if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE |
>> -				 VM_PFNMAP | VM_IO | VM_DONTEXPAND |
>> -				 VM_HUGETLB | VM_MIXEDMAP))
>> -			return 0;		/* just ignore the advice */
>> -
>> -		if (vma_is_dax(vma))
>> +		if (vma->vm_flags & VM_MERGEABLE)
>>  			return 0;
>> -
>> -#ifdef VM_SAO
>> -		if (*vm_flags & VM_SAO)
>> +		if (!vma_ksm_compatible(vma))
>>  			return 0;
>> -#endif
>> -#ifdef VM_SPARC_ADI
>> -		if (*vm_flags & VM_SPARC_ADI)
>> -			return 0;
>> -#endif
>>
>>  		if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
>> -			err = __ksm_enter(mm);
>> +			err = __ksm_enter(mm, MMF_VM_MERGEABLE);
>>  			if (err)
>>  				return err;
>>  		}
>> @@ -2626,7 +2693,7 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
>>  }
>>  EXPORT_SYMBOL_GPL(ksm_madvise);
>>
>> -int __ksm_enter(struct mm_struct *mm)
>> +int __ksm_enter(struct mm_struct *mm, int flag)
>>  {
>>  	struct ksm_mm_slot *mm_slot;
>>  	struct mm_slot *slot;
>> @@ -2659,7 +2726,7 @@ int __ksm_enter(struct mm_struct *mm)
>>  		list_add_tail(&slot->mm_node, &ksm_scan.mm_slot->slot.mm_node);
>>  	spin_unlock(&ksm_mmlist_lock);
>>
>> -	set_bit(MMF_VM_MERGEABLE, &mm->flags);
>> +	set_bit(flag, &mm->flags);
>>  	mmgrab(mm);
>>
>>  	if (needs_wakeup)
>> @@ -2668,7 +2735,7 @@ int __ksm_enter(struct mm_struct *mm)
>>  	return 0;
>>  }
>>
>> -void __ksm_exit(struct mm_struct *mm)
>> +void __ksm_exit(struct mm_struct *mm, int flag)
>>  {
>>  	struct ksm_mm_slot *mm_slot;
>>  	struct mm_slot *slot;
>> @@ -2700,7 +2767,7 @@ void __ksm_exit(struct mm_struct *mm)
>>
>>  	if (easy_to_free) {
>>  		mm_slot_free(mm_slot_cache, mm_slot);
>> -		clear_bit(MMF_VM_MERGEABLE, &mm->flags);
>> +		clear_bit(flag, &mm->flags);
>>  		mmdrop(mm);
>>  	} else if (mm_slot) {
>>  		mmap_write_lock(mm);
>> diff --git a/mm/mmap.c b/mm/mmap.c
>> index 740b54be3ed4..483e182e0b9d 100644
>> --- a/mm/mmap.c
>> +++ b/mm/mmap.c
>> @@ -46,6 +46,7 @@
>>  #include <linux/pkeys.h>
>>  #include <linux/oom.h>
>>  #include <linux/sched/mm.h>
>> +#include <linux/ksm.h>
>>
>>  #include <linux/uaccess.h>
>>  #include <asm/cacheflush.h>
>> @@ -2213,6 +2214,8 @@ int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
>>  	/* vma_complete stores the new vma */
>>  	vma_complete(&vp, vmi, vma->vm_mm);
>>
>> +	ksm_add_vma(new);
>> +
>
> Splitting a VMA shouldn't modify VM_MERGEABLE, so I assume this is not required?
>

I'll fix it.

>>  	/* Success. */
>>  	if (new_below)
>>  		vma_next(vmi);
>> @@ -2664,6 +2667,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
>>  	if (file && vm_flags & VM_SHARED)
>>  		mapping_unmap_writable(file->f_mapping);
>>  	file = vma->vm_file;
>> +	ksm_add_vma(vma);
>>  expanded:
>>  	perf_event_mmap(vma);
>>
>> @@ -2936,6 +2940,7 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
>>  		goto mas_store_fail;
>>
>>  	mm->map_count++;
>> +	ksm_add_vma(vma);
>>  out:
>>  	perf_event_mmap(vma);
>>  	mm->total_vm += len >> PAGE_SHIFT;
>> @@ -3180,6 +3185,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
>>  		if (vma_link(mm, new_vma))
>>  			goto out_vma_link;
>>  		*need_rmap_locks = false;
>> +		ksm_add_vma(new_vma);
>
> Copying shouldn't modify VM_MERGEABLE, so I think this is not required?
>

I'll fix it.

>>  	}
>>  	validate_mm_mt(mm);
>>  	return new_vma;
>> @@ -3356,6 +3362,7 @@ static struct vm_area_struct *__install_special_mapping(
>>  	vm_stat_account(mm, vma->vm_flags, len >> PAGE_SHIFT);
>>
>>  	perf_event_mmap(vma);
>> +	ksm_add_vma(vma);
>
> IIUC, special mappings will never be considered a reasonable target for KSM
> (especially, because at least VM_DONTEXPAND is always set).
>
> I think you can just drop this call.

I dropped it.