Hi Hugh, On Fri, 2013-01-25 at 18:01 -0800, Hugh Dickins wrote: > Switching merge_across_nodes after running KSM is liable to oops on stale > nodes still left over from the previous stable tree. It's not something > that people will often want to do, but it would be lame to demand a reboot > when they're trying to determine which merge_across_nodes setting is best. > > How can this happen? We only permit switching merge_across_nodes when > pages_shared is 0, and usually set run 2 to force that beforehand, which > ought to unmerge everything: yet oopses still occur when you then run 1. > > Three causes: > > 1. The old stable tree (built according to the inverse merge_across_nodes) > has not been fully torn down. A stable node lingers until get_ksm_page() > notices that the page it references no longer references it: but the page > is not necessarily freed as soon as expected, particularly when swapcache. > When can this happen? > Fix this with a pass through the old stable tree, applying get_ksm_page() > to each of the remaining nodes (most found stale and removed immediately), > with forced removal of any left over. Unless the page is still mapped: > I've not seen that case, it shouldn't occur, but better to WARN_ON_ONCE > and EBUSY than BUG. > > 2. __ksm_enter() has a nice little optimization, to insert the new mm > just behind ksmd's cursor, so there's a full pass for it to stabilize > (or be removed) before ksmd addresses it. Nice when ksmd is running, > but not so nice when we're trying to unmerge all mms: we were missing > those mms forked and inserted behind the unmerge cursor. Easily fixed > by inserting at the end when KSM_RUN_UNMERGE. mms forked will be unmerged just after ksmd's cursor since they're inserted behind it, why will be missing? > > 3. It is possible for a KSM page to be faulted back from swapcache into > an mm, just after unmerge_and_remove_all_rmap_items() scanned past it. > Fix this by copying on fault when KSM_RUN_UNMERGE: but that is private > to ksm.c, so dissolve the distinction between ksm_might_need_to_copy() > and ksm_does_need_to_copy(), doing it all in the one call into ksm.c. > Make sense. :) > A long outstanding, unrelated bugfix sneaks in with that third fix: > ksm_does_need_to_copy() would copy from a !PageUptodate page (implying > I/O error when read in from swap) to a page which it then marks Uptodate. > Fix this case by not copying, letting do_swap_page() discover the error. > > Signed-off-by: Hugh Dickins <hughd@xxxxxxxxxx> > --- > include/linux/ksm.h | 18 ++------- > mm/ksm.c | 83 +++++++++++++++++++++++++++++++++++++++--- > mm/memory.c | 19 ++++----- > 3 files changed, 92 insertions(+), 28 deletions(-) > > --- mmotm.orig/include/linux/ksm.h 2013-01-25 14:27:58.220193250 -0800 > +++ mmotm/include/linux/ksm.h 2013-01-25 14:37:00.764206145 -0800 > @@ -16,9 +16,6 @@ > struct stable_node; > struct mem_cgroup; > > -struct page *ksm_does_need_to_copy(struct page *page, > - struct vm_area_struct *vma, unsigned long address); > - > #ifdef CONFIG_KSM > int ksm_madvise(struct vm_area_struct *vma, unsigned long start, > unsigned long end, int advice, unsigned long *vm_flags); > @@ -73,15 +70,8 @@ static inline void set_page_stable_node( > * We'd like to make this conditional on vma->vm_flags & VM_MERGEABLE, > * but what if the vma was unmerged while the page was swapped out? > */ > -static inline int ksm_might_need_to_copy(struct page *page, > - struct vm_area_struct *vma, unsigned long address) > -{ > - struct anon_vma *anon_vma = page_anon_vma(page); > - > - return anon_vma && > - (anon_vma->root != vma->anon_vma->root || > - page->index != linear_page_index(vma, address)); > -} > +struct page *ksm_might_need_to_copy(struct page *page, > + struct vm_area_struct *vma, unsigned long address); > > int page_referenced_ksm(struct page *page, > struct mem_cgroup *memcg, unsigned long *vm_flags); > @@ -113,10 +103,10 @@ static inline int ksm_madvise(struct vm_ > return 0; > } > > -static inline int ksm_might_need_to_copy(struct page *page, > +static inline struct page *ksm_might_need_to_copy(struct page *page, > struct vm_area_struct *vma, unsigned long address) > { > - return 0; > + return page; > } > > static inline int page_referenced_ksm(struct page *page, > --- mmotm.orig/mm/ksm.c 2013-01-25 14:36:58.856206099 -0800 > +++ mmotm/mm/ksm.c 2013-01-25 14:37:00.768206145 -0800 > @@ -644,6 +644,57 @@ static int unmerge_ksm_pages(struct vm_a > /* > * Only called through the sysfs control interface: > */ > +static int remove_stable_node(struct stable_node *stable_node) > +{ > + struct page *page; > + int err; > + > + page = get_ksm_page(stable_node, true); > + if (!page) { > + /* > + * get_ksm_page did remove_node_from_stable_tree itself. > + */ > + return 0; > + } > + > + if (WARN_ON_ONCE(page_mapped(page))) > + err = -EBUSY; > + else { > + /* > + * This page might be in a pagevec waiting to be freed, > + * or it might be PageSwapCache (perhaps under writeback), > + * or it might have been removed from swapcache a moment ago. > + */ > + set_page_stable_node(page, NULL); > + remove_node_from_stable_tree(stable_node); > + err = 0; > + } > + > + unlock_page(page); > + put_page(page); > + return err; > +} > + > +static int remove_all_stable_nodes(void) > +{ > + struct stable_node *stable_node; > + int nid; > + int err = 0; > + > + for (nid = 0; nid < nr_node_ids; nid++) { > + while (root_stable_tree[nid].rb_node) { > + stable_node = rb_entry(root_stable_tree[nid].rb_node, > + struct stable_node, node); > + if (remove_stable_node(stable_node)) { > + err = -EBUSY; > + break; /* proceed to next nid */ > + } > + cond_resched(); > + } > + } > + return err; > +} > + > static int unmerge_and_remove_all_rmap_items(void) > { > struct mm_slot *mm_slot; > @@ -691,6 +742,8 @@ static int unmerge_and_remove_all_rmap_i > } > } > > + /* Clean up stable nodes, but don't worry if some are still busy */ > + remove_all_stable_nodes(); > ksm_scan.seqnr = 0; > return 0; > > @@ -1586,11 +1639,19 @@ int __ksm_enter(struct mm_struct *mm) > spin_lock(&ksm_mmlist_lock); > insert_to_mm_slots_hash(mm, mm_slot); > /* > - * Insert just behind the scanning cursor, to let the area settle > + * When KSM_RUN_MERGE (or KSM_RUN_STOP), > + * insert just behind the scanning cursor, to let the area settle > * down a little; when fork is followed by immediate exec, we don't > * want ksmd to waste time setting up and tearing down an rmap_list. > + * > + * But when KSM_RUN_UNMERGE, it's important to insert ahead of its > + * scanning cursor, otherwise KSM pages in newly forked mms will be > + * missed: then we might as well insert at the end of the list. > */ > - list_add_tail(&mm_slot->mm_list, &ksm_scan.mm_slot->mm_list); > + if (ksm_run & KSM_RUN_UNMERGE) > + list_add_tail(&mm_slot->mm_list, &ksm_mm_head.mm_list); > + else > + list_add_tail(&mm_slot->mm_list, &ksm_scan.mm_slot->mm_list); > spin_unlock(&ksm_mmlist_lock); > > set_bit(MMF_VM_MERGEABLE, &mm->flags); > @@ -1640,11 +1701,25 @@ void __ksm_exit(struct mm_struct *mm) > } > } > > -struct page *ksm_does_need_to_copy(struct page *page, > +struct page *ksm_might_need_to_copy(struct page *page, > struct vm_area_struct *vma, unsigned long address) > { > + struct anon_vma *anon_vma = page_anon_vma(page); > struct page *new_page; > > + if (PageKsm(page)) { > + if (page_stable_node(page) && > + !(ksm_run & KSM_RUN_UNMERGE)) > + return page; /* no need to copy it */ > + } else if (!anon_vma) { > + return page; /* no need to copy it */ > + } else if (anon_vma->root == vma->anon_vma->root && > + page->index == linear_page_index(vma, address)) { > + return page; /* still no need to copy it */ > + } > + if (!PageUptodate(page)) > + return page; /* let do_swap_page report the error */ > + > new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); > if (new_page) { > copy_user_highpage(new_page, page, address, vma); > @@ -2024,7 +2099,7 @@ static ssize_t merge_across_nodes_store( > > mutex_lock(&ksm_thread_mutex); > if (ksm_merge_across_nodes != knob) { > - if (ksm_pages_shared) > + if (ksm_pages_shared || remove_all_stable_nodes()) > err = -EBUSY; > else > ksm_merge_across_nodes = knob; > --- mmotm.orig/mm/memory.c 2013-01-25 14:27:58.220193250 -0800 > +++ mmotm/mm/memory.c 2013-01-25 14:37:00.768206145 -0800 > @@ -2994,17 +2994,16 @@ static int do_swap_page(struct mm_struct > if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val)) > goto out_page; > > - if (ksm_might_need_to_copy(page, vma, address)) { > - swapcache = page; > - page = ksm_does_need_to_copy(page, vma, address); > - > - if (unlikely(!page)) { > - ret = VM_FAULT_OOM; > - page = swapcache; > - swapcache = NULL; > - goto out_page; > - } > + swapcache = page; > + page = ksm_might_need_to_copy(page, vma, address); > + if (unlikely(!page)) { > + ret = VM_FAULT_OOM; > + page = swapcache; > + swapcache = NULL; > + goto out_page; > } > + if (page == swapcache) > + swapcache = NULL; > > if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) { > ret = VM_FAULT_OOM; > > -- > To unsubscribe, send a message with 'unsubscribe linux-mm' in > the body to majordomo@xxxxxxxxx. For more info on Linux MM, > see: http://www.linux-mm.org/ . > Don't email: <a href=mailto:"dont@xxxxxxxxx"> email@xxxxxxxxx </a> -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@xxxxxxxxx. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@xxxxxxxxx"> email@xxxxxxxxx </a>