On Fri, Dec 16, 2016 at 12:42:43PM +0100, Michal Hocko wrote: > On Fri 16-12-16 13:44:38, Kirill A. Shutemov wrote: > > On Fri, Dec 16, 2016 at 11:11:13AM +0100, Michal Hocko wrote: > > > On Fri 16-12-16 10:43:52, Vegard Nossum wrote: > > > [...] > > > > I don't think it's a bug in the OOM reaper itself, but either of the > > > > following two patches will fix the problem (without my understand how or > > > > why): > > > > > > > > diff --git a/mm/oom_kill.c b/mm/oom_kill.c > > > > index ec9f11d4f094..37b14b2e2af4 100644 > > > > --- a/mm/oom_kill.c > > > > +++ b/mm/oom_kill.c > > > > @@ -485,7 +485,7 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, > > > > struct mm_struct *mm) > > > > */ > > > > mutex_lock(&oom_lock); > > > > > > > > - if (!down_read_trylock(&mm->mmap_sem)) { > > > > + if (!down_write_trylock(&mm->mmap_sem)) { > > > > > > __oom_reap_task_mm is basically the same thing as MADV_DONTNEED and that > > > doesn't require the exlusive mmap_sem. So this looks correct to me. > > > > BTW, shouldn't we filter out all VM_SPECIAL VMAs there? Or VM_PFNMAP at > > least. > > > > MADV_DONTNEED doesn't touch VM_PFNMAP, but I don't see anything matching > > on __oom_reap_task_mm() side. > > I guess you are right and we should match the MADV_DONTNEED behavior > here. Care to send a patch? Below. Testing required. > > Other difference is that you use unmap_page_range() witch doesn't touch > > mmu_notifiers. MADV_DONTNEED goes via zap_page_range(), which invalidates > > the range. Not sure if it can make any difference here. > > Which mmu notifier would care about this? I am not really familiar with > those users so I might miss something easily. No idea either. Is there any reason not to use zap_page_range here too? Few more notes: I propably miss something, but why do we need details->ignore_dirty? It only appiled for non-anon pages, but since we filter out shared mappings, how can we have pte_dirty() for !PageAnon()? check_swap_entries is also sloppy: the behavior doesn't match the comment: details == NULL makes it check swap entries. I removed it and restore details->check_mapping test as we had before. After the change no user of zap_page_range() wants non-NULL details, I've dropped the argument. If it looks okay, I'll split it into several patches with proper commit messages. -----8<----- diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index ec1f0dedb948..59ac93714fa4 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -687,7 +687,7 @@ void gmap_discard(struct gmap *gmap, unsigned long from, unsigned long to) /* Find vma in the parent mm */ vma = find_vma(gmap->mm, vmaddr); size = min(to - gaddr, PMD_SIZE - (gaddr & ~PMD_MASK)); - zap_page_range(vma, vmaddr, size, NULL); + zap_page_range(vma, vmaddr, size); } up_read(&gmap->mm->mmap_sem); } diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c index e4f800999b32..4bfb31e79d5d 100644 --- a/arch/x86/mm/mpx.c +++ b/arch/x86/mm/mpx.c @@ -796,7 +796,7 @@ static noinline int zap_bt_entries_mapping(struct mm_struct *mm, return -EINVAL; len = min(vma->vm_end, end) - addr; - zap_page_range(vma, addr, len, NULL); + zap_page_range(vma, addr, len); trace_mpx_unmap_zap(addr, addr+len); vma = vma->vm_next; diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 3c71b982bf2a..d97f6725cf8c 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -629,7 +629,7 @@ static int binder_update_page_range(struct binder_proc *proc, int allocate, page = &proc->pages[(page_addr - proc->buffer) / PAGE_SIZE]; if (vma) zap_page_range(vma, (uintptr_t)page_addr + - proc->user_buffer_offset, PAGE_SIZE, NULL); + proc->user_buffer_offset, PAGE_SIZE); err_vm_insert_page_failed: unmap_kernel_range((unsigned long)page_addr, PAGE_SIZE); err_map_kernel_failed: diff --git a/drivers/staging/android/ion/ion.c b/drivers/staging/android/ion/ion.c index b653451843c8..0fb0e28ace70 100644 --- a/drivers/staging/android/ion/ion.c +++ b/drivers/staging/android/ion/ion.c @@ -865,8 +865,7 @@ static void ion_buffer_sync_for_device(struct ion_buffer *buffer, list_for_each_entry(vma_list, &buffer->vmas, list) { struct vm_area_struct *vma = vma_list->vma; - zap_page_range(vma, vma->vm_start, vma->vm_end - vma->vm_start, - NULL); + zap_page_range(vma, vma->vm_start, vma->vm_end - vma->vm_start); } mutex_unlock(&buffer->lock); } diff --git a/include/linux/mm.h b/include/linux/mm.h index 4424784ac374..92dcada8caaf 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1148,8 +1148,6 @@ struct zap_details { struct address_space *check_mapping; /* Check page->mapping if set */ pgoff_t first_index; /* Lowest page->index to unmap */ pgoff_t last_index; /* Highest page->index to unmap */ - bool ignore_dirty; /* Ignore dirty pages */ - bool check_swap_entries; /* Check also swap entries */ }; struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, @@ -1160,7 +1158,7 @@ struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr, int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, unsigned long size); void zap_page_range(struct vm_area_struct *vma, unsigned long address, - unsigned long size, struct zap_details *); + unsigned long size); void unmap_vmas(struct mmu_gather *tlb, struct vm_area_struct *start_vma, unsigned long start, unsigned long end); diff --git a/mm/internal.h b/mm/internal.h index 44d68895a9b9..5c355855e4ad 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -41,10 +41,9 @@ int do_swap_page(struct vm_fault *vmf); void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, unsigned long floor, unsigned long ceiling); -void unmap_page_range(struct mmu_gather *tlb, - struct vm_area_struct *vma, - unsigned long addr, unsigned long end, - struct zap_details *details); +long madvise_dontneed(struct vm_area_struct *vma, + struct vm_area_struct **prev, + unsigned long start, unsigned long end); extern int __do_page_cache_readahead(struct address_space *mapping, struct file *filp, pgoff_t offset, unsigned long nr_to_read, diff --git a/mm/madvise.c b/mm/madvise.c index 0e3828eae9f8..8c9f19b62b4a 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -468,7 +468,7 @@ static long madvise_free(struct vm_area_struct *vma, * An interface that causes the system to free clean pages and flush * dirty pages is already available as msync(MS_INVALIDATE). */ -static long madvise_dontneed(struct vm_area_struct *vma, +long madvise_dontneed(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end) { @@ -476,7 +476,7 @@ static long madvise_dontneed(struct vm_area_struct *vma, if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP)) return -EINVAL; - zap_page_range(vma, start, end - start, NULL); + zap_page_range(vma, start, end - start); return 0; } diff --git a/mm/memory.c b/mm/memory.c index 455c3e628d52..f8836232a492 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1155,12 +1155,6 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, if (!PageAnon(page)) { if (pte_dirty(ptent)) { - /* - * oom_reaper cannot tear down dirty - * pages - */ - if (unlikely(details && details->ignore_dirty)) - continue; force_flush = 1; set_page_dirty(page); } @@ -1179,8 +1173,8 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, } continue; } - /* only check swap_entries if explicitly asked for in details */ - if (unlikely(details && !details->check_swap_entries)) + /* If details->check_mapping, we leave swap entries. */ + if (unlikely(details)) continue; entry = pte_to_swp_entry(ptent); @@ -1277,7 +1271,7 @@ static inline unsigned long zap_pud_range(struct mmu_gather *tlb, return addr; } -void unmap_page_range(struct mmu_gather *tlb, +static void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long addr, unsigned long end, struct zap_details *details) @@ -1381,7 +1375,7 @@ void unmap_vmas(struct mmu_gather *tlb, * Caller must protect the VMA list */ void zap_page_range(struct vm_area_struct *vma, unsigned long start, - unsigned long size, struct zap_details *details) + unsigned long size) { struct mm_struct *mm = vma->vm_mm; struct mmu_gather tlb; @@ -1392,7 +1386,7 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start, update_hiwater_rss(mm); mmu_notifier_invalidate_range_start(mm, start, end); for ( ; vma && vma->vm_start < end; vma = vma->vm_next) - unmap_single_vma(&tlb, vma, start, end, details); + unmap_single_vma(&tlb, vma, start, end, NULL); mmu_notifier_invalidate_range_end(mm, start, end); tlb_finish_mmu(&tlb, start, end); } diff --git a/mm/oom_kill.c b/mm/oom_kill.c index ec9f11d4f094..f6451eacb0aa 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -465,8 +465,6 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) { struct mmu_gather tlb; struct vm_area_struct *vma; - struct zap_details details = {.check_swap_entries = true, - .ignore_dirty = true}; bool ret = true; /* @@ -481,7 +479,7 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) * out_of_memory * select_bad_process * # no TIF_MEMDIE task selects new victim - * unmap_page_range # frees some memory + * madv_dontneed # frees some memory */ mutex_lock(&oom_lock); @@ -510,16 +508,6 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) tlb_gather_mmu(&tlb, mm, 0, -1); for (vma = mm->mmap ; vma; vma = vma->vm_next) { - if (is_vm_hugetlb_page(vma)) - continue; - - /* - * mlocked VMAs require explicit munlocking before unmap. - * Let's keep it simple here and skip such VMAs. - */ - if (vma->vm_flags & VM_LOCKED) - continue; - /* * Only anonymous pages have a good chance to be dropped * without additional steps which we cannot afford as we @@ -531,8 +519,7 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) * count elevated without a good reason. */ if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED)) - unmap_page_range(&tlb, vma, vma->vm_start, vma->vm_end, - &details); + madvise_dontneed(vma, &vma, vma->vm_start, vma->vm_end); } tlb_finish_mmu(&tlb, 0, -1); pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n", -- Kirill A. Shutemov -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@xxxxxxxxx. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@xxxxxxxxx"> email@xxxxxxxxx </a>