On Fri, 2013-01-25 at 18:05 -0800, Hugh Dickins wrote: > The new KSM NUMA merge_across_nodes knob introduces a problem, when it's > set to non-default 0: if a KSM page is migrated to a different NUMA node, > how do we migrate its stable node to the right tree? And what if that > collides with an existing stable node? > > ksm_migrate_page() can do no more than it's already doing, updating > stable_node->kpfn: the stable tree itself cannot be manipulated without > holding ksm_thread_mutex. So accept that a stable tree may temporarily > indicate a page belonging to the wrong NUMA node, leave updating until > the next pass of ksmd, just be careful not to merge other pages on to a How you not to merge other pages on to a misplaced page? I don't see it. > misplaced page. Note nid of holding tree in stable_node, and recognize > that it will not always match nid of kpfn. > > A misplaced KSM page is discovered, either when ksm_do_scan() next comes > around to one of its rmap_items (we now have to go to cmp_and_merge_page > even on pages in a stable tree), or when stable_tree_search() arrives at > a matching node for another page, and this node page is found misplaced. > > In each case, move the misplaced stable_node to a list of migrate_nodes > (and use the address of migrate_nodes as magic by which to identify them): > we don't need them in a tree. If stable_tree_search() finds no match for > a page, but it's currently exiled to this list, then slot its stable_node > right there into the tree, bringing all of its mappings with it; otherwise > they get migrated one by one to the original page of the colliding node. > stable_tree_search() is now modelled more like stable_tree_insert(), > in order to handle these insertions of migrated nodes. When node will be removed from migrate_nodes list and insert to stable tree? > > remove_node_from_stable_tree(), remove_all_stable_nodes() and > ksm_check_stable_tree() have to handle the migrate_nodes list as well as > the stable tree itself. Less obviously, we do need to prune the list of > stale entries from time to time (scan_get_next_rmap_item() does it once > each full scan): > whereas stale nodes in the stable tree get naturally > pruned as searches try to brush past them, these migrate_nodes may get > forgotten and accumulate. Hard to understand this description. Could you explain it? :) > Signed-off-by: Hugh Dickins <hughd@xxxxxxxxxx> What will happen if page node of an unstable tree migrate to a new numa node? Also need to handle colliding? > --- > mm/ksm.c | 164 +++++++++++++++++++++++++++++++++++++++++++---------- > 1 file changed, 134 insertions(+), 30 deletions(-) > > --- mmotm.orig/mm/ksm.c 2013-01-25 14:37:03.832206218 -0800 > +++ mmotm/mm/ksm.c 2013-01-25 14:37:06.880206290 -0800 > @@ -122,13 +122,25 @@ struct ksm_scan { > /** > * struct stable_node - node of the stable rbtree > * @node: rb node of this ksm page in the stable tree > + * @head: (overlaying parent) &migrate_nodes indicates temporarily on that list > + * @list: linked into migrate_nodes, pending placement in the proper node tree > * @hlist: hlist head of rmap_items using this ksm page > - * @kpfn: page frame number of this ksm page > + * @kpfn: page frame number of this ksm page (perhaps temporarily on wrong nid) > + * @nid: NUMA node id of stable tree in which linked (may not match kpfn) > */ > struct stable_node { > - struct rb_node node; > + union { > + struct rb_node node; /* when node of stable tree */ > + struct { /* when listed for migration */ > + struct list_head *head; > + struct list_head list; > + }; > + }; > struct hlist_head hlist; > unsigned long kpfn; > +#ifdef CONFIG_NUMA > + int nid; > +#endif > }; > > /** > @@ -169,6 +181,9 @@ struct rmap_item { > static struct rb_root root_unstable_tree[MAX_NUMNODES]; > static struct rb_root root_stable_tree[MAX_NUMNODES]; > > +/* Recently migrated nodes of stable tree, pending proper placement */ > +static LIST_HEAD(migrate_nodes); > + > #define MM_SLOTS_HASH_BITS 10 > static DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS); > > @@ -311,11 +326,6 @@ static void insert_to_mm_slots_hash(stru > hash_add(mm_slots_hash, &mm_slot->link, (unsigned long)mm); > } > > -static inline int in_stable_tree(struct rmap_item *rmap_item) > -{ > - return rmap_item->address & STABLE_FLAG; > -} > - > /* > * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's > * page tables after it has passed through ksm_exit() - which, if necessary, > @@ -476,7 +486,6 @@ static void remove_node_from_stable_tree > { > struct rmap_item *rmap_item; > struct hlist_node *hlist; > - int nid; > > hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { > if (rmap_item->hlist.next) > @@ -488,8 +497,11 @@ static void remove_node_from_stable_tree > cond_resched(); > } > > - nid = get_kpfn_nid(stable_node->kpfn); > - rb_erase(&stable_node->node, &root_stable_tree[nid]); > + if (stable_node->head == &migrate_nodes) > + list_del(&stable_node->list); > + else > + rb_erase(&stable_node->node, > + &root_stable_tree[NUMA(stable_node->nid)]); > free_stable_node(stable_node); > } > > @@ -712,6 +724,7 @@ static int remove_stable_node(struct sta > static int remove_all_stable_nodes(void) > { > struct stable_node *stable_node; > + struct list_head *this, *next; > int nid; > int err = 0; > > @@ -726,6 +739,12 @@ static int remove_all_stable_nodes(void) > cond_resched(); > } > } > + list_for_each_safe(this, next, &migrate_nodes) { > + stable_node = list_entry(this, struct stable_node, list); > + if (remove_stable_node(stable_node)) > + err = -EBUSY; > + cond_resched(); > + } > return err; > } > > @@ -1113,25 +1132,30 @@ static struct page *try_to_merge_two_pag > */ > static struct page *stable_tree_search(struct page *page) > { > - struct rb_node *node; > - struct stable_node *stable_node; > int nid; > + struct rb_node **new; > + struct rb_node *parent; > + struct stable_node *stable_node; > + struct stable_node *page_node; > > - stable_node = page_stable_node(page); > - if (stable_node) { /* ksm page forked */ > + page_node = page_stable_node(page); > + if (page_node && page_node->head != &migrate_nodes) { > + /* ksm page forked */ > get_page(page); > return page; > } > > nid = get_kpfn_nid(page_to_pfn(page)); > - node = root_stable_tree[nid].rb_node; > +again: > + new = &root_stable_tree[nid].rb_node; > + parent = NULL; > > - while (node) { > + while (*new) { > struct page *tree_page; > int ret; > > cond_resched(); > - stable_node = rb_entry(node, struct stable_node, node); > + stable_node = rb_entry(*new, struct stable_node, node); > tree_page = get_ksm_page(stable_node, false); > if (!tree_page) > return NULL; > @@ -1139,10 +1163,11 @@ static struct page *stable_tree_search(s > ret = memcmp_pages(page, tree_page); > put_page(tree_page); > > + parent = *new; > if (ret < 0) > - node = node->rb_left; > + new = &parent->rb_left; > else if (ret > 0) > - node = node->rb_right; > + new = &parent->rb_right; > else { > /* > * Lock and unlock the stable_node's page (which > @@ -1152,13 +1177,49 @@ static struct page *stable_tree_search(s > * than kpage, but that involves more changes. > */ > tree_page = get_ksm_page(stable_node, true); > - if (tree_page) > + if (tree_page) { > unlock_page(tree_page); > - return tree_page; > + if (get_kpfn_nid(stable_node->kpfn) != > + NUMA(stable_node->nid)) { > + put_page(tree_page); > + goto replace; > + } > + return tree_page; > + } > + /* > + * There is now a place for page_node, but the tree may > + * have been rebalanced, so re-evaluate parent and new. > + */ > + if (page_node) > + goto again; > + return NULL; > } > } > > - return NULL; > + if (!page_node) > + return NULL; > + > + list_del(&page_node->list); > + DO_NUMA(page_node->nid = nid); > + rb_link_node(&page_node->node, parent, new); > + rb_insert_color(&page_node->node, &root_stable_tree[nid]); > + get_page(page); > + return page; > + > +replace: > + if (page_node) { > + list_del(&page_node->list); > + DO_NUMA(page_node->nid = nid); > + rb_replace_node(&stable_node->node, > + &page_node->node, &root_stable_tree[nid]); > + get_page(page); > + } else { > + rb_erase(&stable_node->node, &root_stable_tree[nid]); > + page = NULL; > + } > + stable_node->head = &migrate_nodes; Why still set this magic since node has already insert to the tree? > + list_add(&stable_node->list, stable_node->head); > + return page; > } > > /* > @@ -1215,6 +1276,7 @@ static struct stable_node *stable_tree_i > INIT_HLIST_HEAD(&stable_node->hlist); > stable_node->kpfn = kpfn; > set_page_stable_node(kpage, stable_node); > + DO_NUMA(stable_node->nid = nid); > rb_link_node(&stable_node->node, parent, new); > rb_insert_color(&stable_node->node, &root_stable_tree[nid]); > > @@ -1311,11 +1373,6 @@ struct rmap_item *unstable_tree_search_i > static void stable_tree_append(struct rmap_item *rmap_item, > struct stable_node *stable_node) > { > - /* > - * Usually rmap_item->nid is already set correctly, > - * but it may be wrong after switching merge_across_nodes. > - */ > - DO_NUMA(rmap_item->nid = get_kpfn_nid(stable_node->kpfn)); > rmap_item->head = stable_node; > rmap_item->address |= STABLE_FLAG; > hlist_add_head(&rmap_item->hlist, &stable_node->hlist); > @@ -1344,10 +1401,29 @@ static void cmp_and_merge_page(struct pa > unsigned int checksum; > int err; > > - remove_rmap_item_from_tree(rmap_item); > + stable_node = page_stable_node(page); > + if (stable_node) { > + if (stable_node->head != &migrate_nodes && > + get_kpfn_nid(stable_node->kpfn) != NUMA(stable_node->nid)) { > + rb_erase(&stable_node->node, > + &root_stable_tree[NUMA(stable_node->nid)]); > + stable_node->head = &migrate_nodes; > + list_add(&stable_node->list, stable_node->head); > + } > + if (stable_node->head != &migrate_nodes && > + rmap_item->head == stable_node) > + return; > + } > > /* We first start with searching the page inside the stable tree */ > kpage = stable_tree_search(page); > + if (kpage == page && rmap_item->head == stable_node) { > + put_page(kpage); > + return; > + } > + > + remove_rmap_item_from_tree(rmap_item); > + > if (kpage) { > err = try_to_merge_with_ksm_page(rmap_item, page, kpage); > if (!err) { > @@ -1464,6 +1540,27 @@ static struct rmap_item *scan_get_next_r > */ > lru_add_drain_all(); > > + /* > + * Whereas stale stable_nodes on the stable_tree itself > + * get pruned in the regular course of stable_tree_search(), > + * those moved out to the migrate_nodes list can accumulate: > + * so prune them once before each full scan. > + */ > + if (!ksm_merge_across_nodes) { > + struct stable_node *stable_node; > + struct list_head *this, *next; > + struct page *page; > + > + list_for_each_safe(this, next, &migrate_nodes) { > + stable_node = list_entry(this, > + struct stable_node, list); > + page = get_ksm_page(stable_node, false); > + if (page) > + put_page(page); > + cond_resched(); > + } > + } > + > for (nid = 0; nid < nr_node_ids; nid++) > root_unstable_tree[nid] = RB_ROOT; > > @@ -1586,8 +1683,7 @@ static void ksm_do_scan(unsigned int sca > rmap_item = scan_get_next_rmap_item(&page); > if (!rmap_item) > return; > - if (!PageKsm(page) || !in_stable_tree(rmap_item)) > - cmp_and_merge_page(page, rmap_item); > + cmp_and_merge_page(page, rmap_item); > put_page(page); > } > } > @@ -1964,6 +2060,7 @@ static void ksm_check_stable_tree(unsign > unsigned long end_pfn) > { > struct stable_node *stable_node; > + struct list_head *this, *next; > struct rb_node *node; > int nid; > > @@ -1984,6 +2081,13 @@ static void ksm_check_stable_tree(unsign > cond_resched(); > } > } > + list_for_each_safe(this, next, &migrate_nodes) { > + stable_node = list_entry(this, struct stable_node, list); > + if (stable_node->kpfn >= start_pfn && > + stable_node->kpfn < end_pfn) > + remove_node_from_stable_tree(stable_node); > + cond_resched(); > + } > } > > static int ksm_memory_callback(struct notifier_block *self, > > -- > To unsubscribe, send a message with 'unsubscribe linux-mm' in > the body to majordomo@xxxxxxxxx. For more info on Linux MM, > see: http://www.linux-mm.org/ . > Don't email: <a href=mailto:"dont@xxxxxxxxx"> email@xxxxxxxxx </a> -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@xxxxxxxxx. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@xxxxxxxxx"> email@xxxxxxxxx </a>