Subject: + mm-rearrange-madvise-code-to-allow-for-reuse.patch added to -mm tree To: ccross@xxxxxxxxxxx,dave.hansen@xxxxxxxxx,ebiederm@xxxxxxxxxxxx,gorcunov@xxxxxxxxxx,hannes@xxxxxxxxxxx,holt@xxxxxxx,hughd@xxxxxxxxxx,jan.glauber@xxxxxxxxx,john.stultz@xxxxxxxxxx,keescook@xxxxxxxxxxxx,mgorman@xxxxxxx,mingo@xxxxxxxxxx,oleg@xxxxxxxxxx,penberg@xxxxxxxxxx,peterz@xxxxxxxxxxxxx,riel@xxxxxxxxxx,rientjes@xxxxxxxxxx,rob@xxxxxxxxxxx,sasha.levin@xxxxxxxxxx,serge.hallyn@xxxxxxxxxx,shli@xxxxxxxxxxxx,tangchen@xxxxxxxxxxxxxx,viro@xxxxxxxxxxxxxxxxxx,walken@xxxxxxxxxx From: akpm@xxxxxxxxxxxxxxxxxxxx Date: Tue, 15 Oct 2013 14:28:45 -0700 The patch titled Subject: mm: rearrange madvise code to allow for reuse has been added to the -mm tree. Its filename is mm-rearrange-madvise-code-to-allow-for-reuse.patch This patch should soon appear at http://ozlabs.org/~akpm/mmots/broken-out/mm-rearrange-madvise-code-to-allow-for-reuse.patch and later at http://ozlabs.org/~akpm/mmotm/broken-out/mm-rearrange-madvise-code-to-allow-for-reuse.patch Before you just go and hit "reply", please: a) Consider who else should be cc'ed b) Prefer to cc a suitable mailing list as well c) Ideally: find the original patch on the mailing list and do a reply-to-all to that, adding suitable additional cc's *** Remember to use Documentation/SubmitChecklist when testing your code *** The -mm tree is included into linux-next and is updated there every 3-4 working days ------------------------------------------------------ From: Colin Cross <ccross@xxxxxxxxxxx> Subject: mm: rearrange madvise code to allow for reuse Refactor the madvise syscall to allow for parts of it to be reused by a prctl syscall that affects vmas. Move the code that walks vmas in a virtual address range into a function that takes a function pointer as a parameter. The only caller for now is sys_madvise, which uses it to call madvise_vma_behavior on each vma, but the next patch will add an additional caller. Move handling all vma behaviors inside madvise_behavior, and rename it to madvise_vma_behavior. Move the code that updates the flags on a vma, including splitting or merging the vma as necessary, into a new function called madvise_update_vma. The next patch will add support for updating a new anon_name field as well. Signed-off-by: Colin Cross <ccross@xxxxxxxxxxx> Cc: Pekka Enberg <penberg@xxxxxxxxxx> Cc: Dave Hansen <dave.hansen@xxxxxxxxx> Cc: Peter Zijlstra <peterz@xxxxxxxxxxxxx> Cc: Ingo Molnar <mingo@xxxxxxxxxx> Cc: Oleg Nesterov <oleg@xxxxxxxxxx> Cc: "Eric W. Biederman" <ebiederm@xxxxxxxxxxxx> Cc: Jan Glauber <jan.glauber@xxxxxxxxx> Cc: John Stultz <john.stultz@xxxxxxxxxx> Cc: Rob Landley <rob@xxxxxxxxxxx> Cc: Cyrill Gorcunov <gorcunov@xxxxxxxxxx> Cc: Kees Cook <keescook@xxxxxxxxxxxx> Cc: "Serge E. Hallyn" <serge.hallyn@xxxxxxxxxx> Cc: David Rientjes <rientjes@xxxxxxxxxx> Cc: Al Viro <viro@xxxxxxxxxxxxxxxxxx> Cc: Hugh Dickins <hughd@xxxxxxxxxx> Cc: Rik van Riel <riel@xxxxxxxxxx> Cc: Mel Gorman <mgorman@xxxxxxx> Cc: Michel Lespinasse <walken@xxxxxxxxxx> Cc: Tang Chen <tangchen@xxxxxxxxxxxxxx> Cc: Robin Holt <holt@xxxxxxx> Cc: Shaohua Li <shli@xxxxxxxxxxxx> Cc: Sasha Levin <sasha.levin@xxxxxxxxxx> Cc: Johannes Weiner <hannes@xxxxxxxxxxx> Signed-off-by: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx> --- mm/madvise.c | 272 +++++++++++++++++++++++++++---------------------- 1 file changed, 151 insertions(+), 121 deletions(-) diff -puN mm/madvise.c~mm-rearrange-madvise-code-to-allow-for-reuse mm/madvise.c --- a/mm/madvise.c~mm-rearrange-madvise-code-to-allow-for-reuse +++ a/mm/madvise.c @@ -39,65 +39,20 @@ static int madvise_need_mmap_write(int b } /* - * We can potentially split a vm area into separate - * areas, each area with its own behavior. + * Update the vm_flags on regiion of a vma, splitting it or merging it as + * necessary. Must be called with mmap_sem held for writing; */ -static long madvise_behavior(struct vm_area_struct *vma, - struct vm_area_struct **prev, - unsigned long start, unsigned long end, int behavior) +static int madvise_update_vma(struct vm_area_struct *vma, + struct vm_area_struct **prev, unsigned long start, + unsigned long end, unsigned long new_flags) { struct mm_struct *mm = vma->vm_mm; - int error = 0; pgoff_t pgoff; - unsigned long new_flags = vma->vm_flags; - - switch (behavior) { - case MADV_NORMAL: - new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ; - break; - case MADV_SEQUENTIAL: - new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ; - break; - case MADV_RANDOM: - new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ; - break; - case MADV_DONTFORK: - new_flags |= VM_DONTCOPY; - break; - case MADV_DOFORK: - if (vma->vm_flags & VM_IO) { - error = -EINVAL; - goto out; - } - new_flags &= ~VM_DONTCOPY; - break; - case MADV_DONTDUMP: - new_flags |= VM_DONTDUMP; - break; - case MADV_DODUMP: - if (new_flags & VM_SPECIAL) { - error = -EINVAL; - goto out; - } - new_flags &= ~VM_DONTDUMP; - break; - case MADV_MERGEABLE: - case MADV_UNMERGEABLE: - error = ksm_madvise(vma, start, end, behavior, &new_flags); - if (error) - goto out; - break; - case MADV_HUGEPAGE: - case MADV_NOHUGEPAGE: - error = hugepage_madvise(vma, &new_flags, behavior); - if (error) - goto out; - break; - } + int error; if (new_flags == vma->vm_flags) { *prev = vma; - goto out; + return 0; } pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); @@ -113,13 +68,13 @@ static long madvise_behavior(struct vm_a if (start != vma->vm_start) { error = split_vma(mm, vma, start, 1); if (error) - goto out; + return error; } if (end != vma->vm_end) { error = split_vma(mm, vma, end, 0); if (error) - goto out; + return error; } success: @@ -128,10 +83,7 @@ success: */ vma->vm_flags = new_flags; -out: - if (error == -ENOMEM) - error = -EAGAIN; - return error; + return 0; } #ifdef CONFIG_SWAP @@ -337,6 +289,77 @@ static long madvise_remove(struct vm_are return error; } +/* + * Apply an madvise behavior to a region of a vma. madvise_update_vma + * will handle splitting a vm area into separate areas, each area with its own + * behavior. + */ +static int madvise_vma_behavior(struct vm_area_struct *vma, + struct vm_area_struct **prev, + unsigned long start, unsigned long end, + unsigned long behavior) +{ + int error = 0; + unsigned long new_flags = vma->vm_flags; + + switch (behavior) { + case MADV_REMOVE: + return madvise_remove(vma, prev, start, end); + case MADV_WILLNEED: + return madvise_willneed(vma, prev, start, end); + case MADV_DONTNEED: + return madvise_dontneed(vma, prev, start, end); + case MADV_NORMAL: + new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ; + break; + case MADV_SEQUENTIAL: + new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ; + break; + case MADV_RANDOM: + new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ; + break; + case MADV_DONTFORK: + new_flags |= VM_DONTCOPY; + break; + case MADV_DOFORK: + if (vma->vm_flags & VM_IO) { + error = -EINVAL; + goto out; + } + new_flags &= ~VM_DONTCOPY; + break; + case MADV_DONTDUMP: + new_flags |= VM_DONTDUMP; + break; + case MADV_DODUMP: + if (new_flags & VM_SPECIAL) { + error = -EINVAL; + goto out; + } + new_flags &= ~VM_DONTDUMP; + break; + case MADV_MERGEABLE: + case MADV_UNMERGEABLE: + error = ksm_madvise(vma, start, end, behavior, &new_flags); + if (error) + goto out; + break; + case MADV_HUGEPAGE: + case MADV_NOHUGEPAGE: + error = hugepage_madvise(vma, &new_flags, behavior); + if (error) + goto out; + break; + } + + error = madvise_update_vma(vma, prev, start, end, new_flags); + +out: + if (error == -ENOMEM) + error = -EAGAIN; + return error; +} + #ifdef CONFIG_MEMORY_FAILURE /* * Error injection support for memory error handling. @@ -375,22 +398,6 @@ static int madvise_hwpoison(int bhv, uns } #endif -static long -madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, - unsigned long start, unsigned long end, int behavior) -{ - switch (behavior) { - case MADV_REMOVE: - return madvise_remove(vma, prev, start, end); - case MADV_WILLNEED: - return madvise_willneed(vma, prev, start, end); - case MADV_DONTNEED: - return madvise_dontneed(vma, prev, start, end); - default: - return madvise_behavior(vma, prev, start, end, behavior); - } -} - static int madvise_behavior_valid(int behavior) { @@ -421,6 +428,73 @@ madvise_behavior_valid(int behavior) } /* + * Walk the vmas in range [start,end), and call the visit function on each one. + * The visit function will get start and end parameters that cover the overlap + * between the current vma and the original range. Any unmapped regions in the + * original range will result in this function returning -ENOMEM while still + * calling the visit function on all of the existing vmas in the range. + * Must be called with the mmap_sem held for reading or writing. + */ +static +int madvise_walk_vmas(unsigned long start, unsigned long end, + unsigned long arg, + int (*visit)(struct vm_area_struct *vma, + struct vm_area_struct **prev, unsigned long start, + unsigned long end, unsigned long arg)) +{ + struct vm_area_struct *vma; + struct vm_area_struct *prev; + unsigned long tmp; + int unmapped_error = 0; + + /* + * If the interval [start,end) covers some unmapped address + * ranges, just ignore them, but return -ENOMEM at the end. + * - different from the way of handling in mlock etc. + */ + vma = find_vma_prev(current->mm, start, &prev); + if (vma && start > vma->vm_start) + prev = vma; + + for (;;) { + int error; + + /* Still start < end. */ + if (!vma) + return -ENOMEM; + + /* Here start < (end|vma->vm_end). */ + if (start < vma->vm_start) { + unmapped_error = -ENOMEM; + start = vma->vm_start; + if (start >= end) + break; + } + + /* Here vma->vm_start <= start < (end|vma->vm_end) */ + tmp = vma->vm_end; + if (end < tmp) + tmp = end; + + /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */ + error = visit(vma, &prev, start, tmp, arg); + if (error) + return error; + start = tmp; + if (prev && start < prev->vm_end) + start = prev->vm_end; + if (start >= end) + break; + if (prev) + vma = prev->vm_next; + else /* madvise_remove dropped mmap_sem */ + vma = find_vma(current->mm, start); + } + + return unmapped_error; +} + +/* * The madvise(2) system call. * * Applications can use madvise() to advise the kernel how it should @@ -464,9 +538,7 @@ madvise_behavior_valid(int behavior) */ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) { - unsigned long end, tmp; - struct vm_area_struct *vma, *prev; - int unmapped_error = 0; + unsigned long end; int error = -EINVAL; int write; size_t len; @@ -501,52 +573,10 @@ SYSCALL_DEFINE3(madvise, unsigned long, else down_read(¤t->mm->mmap_sem); - /* - * If the interval [start,end) covers some unmapped address - * ranges, just ignore them, but return -ENOMEM at the end. - * - different from the way of handling in mlock etc. - */ - vma = find_vma_prev(current->mm, start, &prev); - if (vma && start > vma->vm_start) - prev = vma; - blk_start_plug(&plug); - for (;;) { - /* Still start < end. */ - error = -ENOMEM; - if (!vma) - goto out; - - /* Here start < (end|vma->vm_end). */ - if (start < vma->vm_start) { - unmapped_error = -ENOMEM; - start = vma->vm_start; - if (start >= end) - goto out; - } - - /* Here vma->vm_start <= start < (end|vma->vm_end) */ - tmp = vma->vm_end; - if (end < tmp) - tmp = end; - - /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */ - error = madvise_vma(vma, &prev, start, tmp, behavior); - if (error) - goto out; - start = tmp; - if (prev && start < prev->vm_end) - start = prev->vm_end; - error = unmapped_error; - if (start >= end) - goto out; - if (prev) - vma = prev->vm_next; - else /* madvise_remove dropped mmap_sem */ - vma = find_vma(current->mm, start); - } -out: + error = madvise_walk_vmas(start, end, behavior, madvise_vma_behavior); blk_finish_plug(&plug); + if (write) up_write(¤t->mm->mmap_sem); else _ Patches currently in -mm which might be from ccross@xxxxxxxxxxx are mm-rearrange-madvise-code-to-allow-for-reuse.patch mm-add-a-field-to-store-names-for-private-anonymous-memory.patch -- To unsubscribe from this list: send the line "unsubscribe mm-commits" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html