On a box with both DRAM and PMEM managed by the mm system, nodes 0 and 1 are usually DRAM nodes and nodes 2 and 3 are PMEM nodes. The nofallback lists are the same as before; the fallback lists are now redesigned to be arranged on a per-node-type basis. IOW, an allocation request for a DRAM page starting from node 0 will go through the node0->node1->node2->node3 zonelists. Signed-off-by: Fan Du <fan.du@xxxxxxxxx> --- include/linux/mmzone.h | 8 ++++++++ mm/page_alloc.c | 42 ++++++++++++++++++++++++++---------------- 2 files changed, 34 insertions(+), 16 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index d3ee9f9..8c37e1c 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -939,6 +939,14 @@ static inline int is_node_dram(int nid) return test_bit(PGDAT_DRAM, &pgdat->flags); } +static inline int is_node_same_type(int nida, int nidb) +{ + if (node_isset(nida, numa_nodes_pmem)) + return node_isset(nidb, numa_nodes_pmem); + else + return node_isset(nidb, numa_nodes_dram); +} + static inline void set_node_type(int nid) { pg_data_t *pgdat = NODE_DATA(nid); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c6ce20a..a408a91 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5372,7 +5372,7 @@ int numa_zonelist_order_handler(struct ctl_table *table, int write, * * Return: node id of the found node or %NUMA_NO_NODE if no node is found. 
*/ -static int find_next_best_node(int node, nodemask_t *used_node_mask) +static int find_next_best_node(int node, nodemask_t *used_node_mask, int need_same_type) { int n, val; int min_val = INT_MAX; @@ -5380,7 +5380,7 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask) const struct cpumask *tmp = cpumask_of_node(0); /* Use the local node if we haven't already */ - if (!node_isset(node, *used_node_mask)) { + if (need_same_type && !node_isset(node, *used_node_mask)) { node_set(node, *used_node_mask); return node; } @@ -5391,6 +5391,12 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask) if (node_isset(n, *used_node_mask)) continue; + if (need_same_type && !is_node_same_type(node, n)) + continue; + + if (!need_same_type && is_node_same_type(node, n)) + continue; + /* Use the distance array to find the distance */ val = node_distance(node, n); @@ -5472,31 +5478,35 @@ static void build_zonelists(pg_data_t *pgdat) int node, load, nr_nodes = 0; nodemask_t used_mask; int local_node, prev_node; + int need_same_type; /* NUMA-aware ordering of nodes */ local_node = pgdat->node_id; load = nr_online_nodes; prev_node = local_node; - nodes_clear(used_mask); memset(node_order, 0, sizeof(node_order)); - while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { - /* - * We don't want to pressure a particular node. - * So adding penalty to the first node in same - * distance group to make it round-robin. - */ - if (node_distance(local_node, node) != - node_distance(local_node, prev_node)) - node_load[node] = load; + for (need_same_type = 1; need_same_type >= 0; need_same_type--) { + nodes_clear(used_mask); + while ((node = find_next_best_node(local_node, &used_mask, + need_same_type)) >= 0) { + /* + * We don't want to pressure a particular node. + * So adding penalty to the first node in same + * distance group to make it round-robin. 
+ */ + if (node_distance(local_node, node) != + node_distance(local_node, prev_node)) + node_load[node] = load; - node_order[nr_nodes++] = node; - prev_node = node; - load--; + node_order[nr_nodes++] = node; + prev_node = node; + load--; + } } - build_zonelists_in_node_order(pgdat, node_order, nr_nodes); build_thisnode_zonelists(pgdat); + } #ifdef CONFIG_HAVE_MEMORYLESS_NODES -- 1.8.3.1