Hi, I'd like to solicit comments on this proposal: Currently mlock() holds mmap_sem in exclusive mode while the pages get faulted in. In the case of a large mlock, this can potentially take a very long time. I propose that mlock() could release mmap_sem after the VM_LOCKED bits have been set in all appropriate VMAs. Then a second pass could be done to actually mlock the pages, in small batches, never holding mmap_sem for longer than it takes to process one single batch. We need to recheck the vma flags whenever we re-acquire mmap_sem, but this is not difficult. This is only an RFC rather than an actual submission, as I think this could / should be completed to handle more than the mlock() and mlockall() cases (there are many call sites to mlock_vma_pages_range() that should ideally be converted as well), and maybe use the fault retry mechanism to drop mmap_sem when blocking on disk access rather than using an arbitrary page batch size. Patch is against v2.6.36, but should apply to Linus's tree too. ------------------------------- 8< ----------------------------- Let mlock / mlockall release mmap_sem after the vmas have been marked as VM_LOCKED. Then, mark the vmas as mlocked in small batches. For every batch, we need to grab mmap_sem in read mode, check that the vma has not been munlocked, and mlock the pages. In the case where a vma has been munlocked before mlock completes, pages that were already marked as PageMlocked() are handled by the munlock() call, and mlock() is careful to not mark new page batches as PageMlocked() after the munlock() call has cleared the VM_LOCKED vma flags. So, the end result will be identical to what'd happen if munlock() had executed after the mlock() call. 
Signed-off-by: Michel Lespinasse <walken@xxxxxxxxxx> --- mm/mlock.c | 79 +++++++++++++++++++++++++++++++++++++++++++++++------------ 1 files changed, 63 insertions(+), 16 deletions(-) diff --git a/mm/mlock.c b/mm/mlock.c index b70919c..0aa4df5 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -373,17 +373,11 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, int lock = newflags & VM_LOCKED; if (newflags == vma->vm_flags || - (vma->vm_flags & (VM_IO | VM_PFNMAP))) + (vma->vm_flags & (VM_IO | VM_PFNMAP | + VM_DONTEXPAND | VM_RESERVED)) || + is_vm_hugetlb_page(vma) || vma == get_gate_vma(current)) goto out; /* don't set VM_LOCKED, don't count */ - if ((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) || - is_vm_hugetlb_page(vma) || - vma == get_gate_vma(current)) { - if (lock) - make_pages_present(start, end); - goto out; /* don't set VM_LOCKED, don't count */ - } - pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); *prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma)); @@ -419,14 +413,10 @@ success: * set VM_LOCKED, __mlock_vma_pages_range will bring it back. 
*/ - if (lock) { + if (lock) vma->vm_flags = newflags; - ret = __mlock_vma_pages_range(vma, start, end); - if (ret < 0) - ret = __mlock_posix_error_return(ret); - } else { + else munlock_vma_pages_range(vma, start, end); - } out: *prev = vma; @@ -439,7 +429,8 @@ static int do_mlock(unsigned long start, size_t len, int on) struct vm_area_struct * vma, * prev; int error; - len = PAGE_ALIGN(len); + VM_BUG_ON(start & ~PAGE_MASK); + VM_BUG_ON(len != PAGE_ALIGN(len)); end = start + len; if (end < start) return -EINVAL; @@ -482,6 +473,58 @@ static int do_mlock(unsigned long start, size_t len, int on) return error; } +static int do_mlock_pages(unsigned long start, size_t len) +{ + struct mm_struct *mm = current->mm; + unsigned long end, nstart, nend, nfault; + struct vm_area_struct *vma; + int error = 0; + + VM_BUG_ON(start & ~PAGE_MASK); + VM_BUG_ON(len != PAGE_ALIGN(len)); + end = start + len; + + for (nstart = start; nstart < end; nstart = nend) { + down_read(&mm->mmap_sem); + nend = end; + vma = find_vma_intersection(mm, nstart, nend); + if (!vma) + goto up; + if (vma->vm_end < nend) + nend = vma->vm_end; + if (vma->vm_flags & (VM_IO | VM_PFNMAP)) + goto up; + if (nstart < vma->vm_start) + nstart = vma->vm_start; + + /* + * Limit batch size to 256 pages in order to reduce + * mmap_sem hold time. + */ + nfault = nstart + 256 * PAGE_SIZE; + + /* + * Now fault in a batch of pages. We need to check the vma + * flags again, as we've not been holding mmap_sem. 
+ */ + if ((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) || + is_vm_hugetlb_page(vma) || vma == get_gate_vma(current)) { + if (nfault < nend) + nend = nfault; + make_pages_present(nstart, nend); + } else if (vma->vm_flags & VM_LOCKED) { + if (nfault < nend) + nend = nfault; + error = __mlock_vma_pages_range(vma, nstart, nend); + } + up: + up_read(&mm->mmap_sem); + if (error) + return __mlock_posix_error_return(error); + } + return 0; +} + SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) { unsigned long locked; @@ -507,6 +550,8 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) if ((locked <= lock_limit) || capable(CAP_IPC_LOCK)) error = do_mlock(start, len, 1); up_write(&current->mm->mmap_sem); + if (!error) + error = do_mlock_pages(start, len); return error; } @@ -571,6 +616,8 @@ SYSCALL_DEFINE1(mlockall, int, flags) capable(CAP_IPC_LOCK)) ret = do_mlockall(flags); up_write(&current->mm->mmap_sem); + if (!ret && (flags & MCL_CURRENT)) + ret = do_mlock_pages(0, TASK_SIZE); out: return ret; } -- Michel "Walken" Lespinasse A program is never fully debugged until the last user dies. -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@xxxxxxxxxx For more info on Linux MM, see: http://www.linux-mm.org/ . Fight unfair telecom policy in Canada: sign http://dissolvethecrtc.ca/ Don't email: <a href=mailto:"dont@xxxxxxxxx"> email@xxxxxxxxx </a>