[RFC][PATCH v2 08/21] mm: introduce and export pgdat peer_node

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



From: Fan Du <fan.du@xxxxxxxxx>

Each CPU socket can have one DRAM node and one PMEM node; we call them
"peer nodes". By default, migration between DRAM and PMEM happens between
peer nodes.

This is a temporary solution. With multiple memory layers, a node can have
both promotion and demotion targets instead of a single peer node. User
space may also be able to infer promotion/demotion targets from future
HMAT info.

Signed-off-by: Fan Du <fan.du@xxxxxxxxx>
Signed-off-by: Fengguang Wu <fengguang.wu@xxxxxxxxx>
---
 drivers/base/node.c    |   11 +++++++++++
 include/linux/mmzone.h |   12 ++++++++++++
 mm/page_alloc.c        |   29 +++++++++++++++++++++++++++++
 3 files changed, 52 insertions(+)

--- linux.orig/drivers/base/node.c	2018-12-23 19:39:51.647261099 +0800
+++ linux/drivers/base/node.c	2018-12-23 19:39:51.643261112 +0800
@@ -242,6 +242,16 @@ static ssize_t type_show(struct device *
 }
 static DEVICE_ATTR(type, S_IRUGO, type_show, NULL);
 
+static ssize_t peer_node_show(struct device *dev,
+			struct device_attribute *attr, char *buf)
+{
+	int nid = dev->id;	/* the device id is used as the node id */
+	struct pglist_data *pgdat = NODE_DATA(nid);
+
+	return sprintf(buf, "%d\n", pgdat->peer_node);	/* decimal node id + newline */
+}
+static DEVICE_ATTR(peer_node, S_IRUGO, peer_node_show, NULL);	/* read-only: no store handler */
+
 static struct attribute *node_dev_attrs[] = {
 	&dev_attr_cpumap.attr,
 	&dev_attr_cpulist.attr,
@@ -250,6 +260,7 @@ static struct attribute *node_dev_attrs[
 	&dev_attr_distance.attr,
 	&dev_attr_vmstat.attr,
 	&dev_attr_type.attr,
+	&dev_attr_peer_node.attr,
 	NULL
 };
 ATTRIBUTE_GROUPS(node_dev);
--- linux.orig/include/linux/mmzone.h	2018-12-23 19:39:51.647261099 +0800
+++ linux/include/linux/mmzone.h	2018-12-23 19:39:51.643261112 +0800
@@ -713,6 +713,18 @@ typedef struct pglist_data {
 	/* Per-node vmstats */
 	struct per_cpu_nodestat __percpu *per_cpu_nodestats;
 	atomic_long_t		vm_stat[NR_VM_NODE_STAT_ITEMS];
+
+	/*
+	 * Node id of the nearest other node in terms of node distance
+	 * E.g. peer of node 0 is node 2 per SLIT
+	 * node distances:
+	 * node   0   1   2   3
+	 *   0:  10  21  17  28
+	 *   1:  21  10  28  17
+	 *   2:  17  28  10  28
+	 *   3:  28  17  28  10
+	 */
+	int	peer_node;
 } pg_data_t;
 
 #define node_present_pages(nid)	(NODE_DATA(nid)->node_present_pages)
--- linux.orig/mm/page_alloc.c	2018-12-23 19:39:51.647261099 +0800
+++ linux/mm/page_alloc.c	2018-12-23 19:39:51.643261112 +0800
@@ -6926,6 +6926,34 @@ static void check_for_memory(pg_data_t *
 	}
 }
 
+/*
+ * Return the online node other than @nid with the smallest
+ * node_distance() from @nid, or NUMA_NO_NODE if there is none.
+ * E.g. per the SLIT distances below, the peer of node 0 is node 2:
+ * node   0   1   2   3
+ *   0:  10  21  17  28
+ *   1:  21  10  28  17
+ *   2:  17  28  10  28
+ *   3:  28  17  28  10
+ */
+static int find_best_peer_node(int nid)
+{
+	int n, val;
+	int min_val = INT_MAX;
+	int peer = NUMA_NO_NODE;
+
+	for_each_online_node(n) {
+		if (n == nid)	/* a node is never its own peer */
+			continue;
+		val = node_distance(nid, n);
+		if (val < min_val) {	/* strictly less: first node visited wins a tie */
+			min_val = val;
+			peer = n;
+		}
+	}
+	return peer;
+}
+
 /**
  * free_area_init_nodes - Initialise all pg_data_t and zone data
  * @max_zone_pfn: an array of max PFNs for each zone
@@ -7012,6 +7040,7 @@ void __init free_area_init_nodes(unsigne
 		if (pgdat->node_present_pages)
 			node_set_state(nid, N_MEMORY);
 		check_for_memory(pgdat, nid);
+		pgdat->peer_node = find_best_peer_node(nid);
 	}
 }
 





[Index of Archives]     [KVM ARM]     [KVM ia64]     [KVM ppc]     [Virtualization Tools]     [Spice Development]     [Libvirt]     [Libvirt Users]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite Questions]     [Linux Kernel]     [Linux SCSI]     [XFree86]

  Powered by Linux