[PATCH] drivers/base/node.c: export physical address range of given node (Re: NUMA node information for pages)

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



# Sorry if duplicated, I might fail sending this before ...

(CC:ed linux-mm)

On Thu, Apr 10, 2014 at 03:06:49PM -0700, David Rientjes wrote:
> On Wed, 9 Apr 2014, Naoya Horiguchi wrote:
> 
> > >  [ And that block_size_bytes file is absolutely horrid, why are we
> > >    exporting all this information in hex and not telling anybody? ]
> > 
> > Indeed, this kind of implicit hex numbers are commonly used in many place.
> > I guess that it's maybe for historical reasons.
> > 
> 
> I think it was meant to be simple to that you could easily add the length 
> to the start, but it should at least prefix this with `0x'.  That code has 
> been around for years, though, so we probably can't fix it now.
> 
> > > I'd much prefer a single change that works for everybody and userspace can 
> > > rely on exporting accurate information as long as sysfs is mounted, and 
> > > not even need to rely on getpagesize() to convert from pfn to physical 
> > > address: just simple {start,end}_phys_addr files added to 
> > > /sys/devices/system/node/nodeN/ for node N.  Online information can 
> > > already be parsed for these ranges from /sys/devices/system/node/online.
> > 
> > OK, so what if some node has multiple address ranges?  I don't think that
> > start(end)_phys_addr simply returns minimum (maximum) possible address is optimal,
> > because users can't know about void range between valid address ranges
> > (non-exist pfn should not belong to any node).
> > Are printing multilined (or comma-separated) ranges preferable for example
> > like below?
> > 
> >   $ cat /sys/devices/system/node/nodeN/phys_addr
> >   0x0-0x80000000
> >   0x100000000-0x180000000
> > 
> 
> What the...?  nodeN should represent the pgdat for that node and a pgdat 
> can only have a single range.

Yes, that's right, but it seems to me that just node_start_pfn and node_end_pfn
is not enough because there can be holes (without any page struct backed) inside
[node_start_pfn, node_end_pfn), and it's not aware of memory hotplug.

> I'm suggesting that 
> /sys/devices/system/node/nodeN/start_phys_addr returns 
> node_start_pfn(N) << PAGE_SHIFT and 
> /sys/devices/system/node/nodeN/end_phys_addr returns
> node_end_pfn(N) << PAGE_SHIFT and prefix them correctly this time with 
> `0x'.

Yes, we should have '0x' prefix for new interfaces.

I wrote a patch, so could you take a look?

I tested !CONFIG_MEMORY_HOTPLUG_SPARSE case too, and it works fine.

Thanks,
Naoya Horiguchi
---
From: Naoya Horiguchi <n-horiguchi@xxxxxxxxxxxxx>
Date: Thu, 10 Apr 2014 20:59:59 -0400
Subject: [PATCH] drivers/base/node.c: export physical address range of given
 node

Userspace applications sometimes want to know which node a give page belongs to,
so this patch adds a new interface /sys/devices/system/node/nodeN/phys_addr
which gives the physical address ranges of node N. A reader gets multilined
physical address ranges backed by valid pfn (neither holes within the zone nor
offlined range are not included in the output.)

Here is the example.

  # cat /sys/devices/system/node/node0/phys_addr
  0x1000-0x40000000
  # cat /sys/devices/system/node/node3/phys_addr
  0xc0000000-0xe0000000
  0x100000000-0x120000000
  # echo offline > /sys/devices/system/memory/memory33/state
  # cat /sys/devices/system/node/node3/phys_addr
  0xc0000000-0xe0000000
  0x100000000-0x108000000
  0x110000000-0x120000000

Signed-off-by: Naoya Horiguchi <n-horiguchi@xxxxxxxxxxxxx>
---
 drivers/base/node.c    | 60 ++++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/memory.h |  4 ++++
 include/linux/mmzone.h |  1 +
 3 files changed, 65 insertions(+)

diff --git a/drivers/base/node.c b/drivers/base/node.c
index bc9f43bf7e29..21e87909ec4f 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -18,6 +18,7 @@
 #include <linux/device.h>
 #include <linux/swap.h>
 #include <linux/slab.h>
+#include <linux/memblock.h>
 
 static struct bus_type node_subsys = {
 	.name = "node",
@@ -185,6 +186,64 @@ static ssize_t node_read_vmstat(struct device *dev,
 }
 static DEVICE_ATTR(vmstat, S_IRUGO, node_read_vmstat, NULL);
 
+#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
+static bool valid_mem_section(unsigned long sec_nr)
+{
+	struct memory_block *memblk;
+	if (!present_section_nr(sec_nr))
+		return false;
+	memblk = find_memory_block(__nr_to_section(sec_nr));
+	if (!memblk || is_memblock_offlined(memblk))
+		return false;
+	return true;
+}
+#endif
+
+static ssize_t node_read_phys_addr(struct device *dev,
+				struct device_attribute *attr, char *buf)
+{
+	struct pglist_data *pgdat = NODE_DATA(dev->id);
+	unsigned long start, end;
+	int n = 0;
+	unsigned long pfn;
+	unsigned long rstart = 0, rend = 0; /* range's start and end */
+
+	start = pgdat->node_start_pfn;
+	end = pgdat_end_pfn(pgdat);
+	pfn = start;
+	while (pfn < end) {
+#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
+		if (unlikely(!valid_mem_section(pfn_to_section_nr(pfn)))) {
+			if (rstart) {
+				n += sprintf(buf + n, "0x%llx-0x%llx\n",
+					     PFN_PHYS(rstart), PFN_PHYS(rend));
+				rstart = 0;
+				rend = 0;
+			}
+			pfn = SECTION_NEXT_BOUNDARY(pfn);
+			continue;
+		}
+#endif
+		if (pfn_valid(pfn)) {
+			if (!rstart)
+				rstart = pfn;
+			rend = pfn + 1;
+		} else {
+			if (rstart)
+				n += sprintf(buf + n, "0x%llx-0x%llx\n",
+					     PFN_PHYS(rstart), PFN_PHYS(rend));
+			rstart = 0;
+			rend = 0;
+		}
+		pfn++;
+	}
+	if (rstart)
+		n += sprintf(buf + n, "0x%llx-0x%llx\n",
+			     PFN_PHYS(rstart), PFN_PHYS(rend));
+	return n;
+}
+static DEVICE_ATTR(phys_addr, S_IRUGO, node_read_phys_addr, NULL);
+
 static ssize_t node_read_distance(struct device *dev,
 			struct device_attribute *attr, char * buf)
 {
@@ -288,6 +347,7 @@ static int register_node(struct node *node, int num, struct node *parent)
 		device_create_file(&node->dev, &dev_attr_numastat);
 		device_create_file(&node->dev, &dev_attr_distance);
 		device_create_file(&node->dev, &dev_attr_vmstat);
+		device_create_file(&node->dev, &dev_attr_phys_addr);
 
 		scan_unevictable_register_node(node);
 
diff --git a/include/linux/memory.h b/include/linux/memory.h
index bb7384e3c3d8..b68e14f8818c 100644
--- a/include/linux/memory.h
+++ b/include/linux/memory.h
@@ -103,6 +103,10 @@ static inline int memory_isolate_notify(unsigned long val, void *v)
 {
 	return 0;
 }
+static inline struct memory_block *find_memory_block(struct mem_section *ms)
+{
+	return NULL;
+}
 #else
 extern int register_memory_notifier(struct notifier_block *nb);
 extern void unregister_memory_notifier(struct notifier_block *nb);
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 9b61b9bf81ac..138a7a6684ab 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1084,6 +1084,7 @@ static inline unsigned long early_pfn_to_nid(unsigned long pfn)
 
 #define SECTION_ALIGN_UP(pfn)	(((pfn) + PAGES_PER_SECTION - 1) & PAGE_SECTION_MASK)
 #define SECTION_ALIGN_DOWN(pfn)	((pfn) & PAGE_SECTION_MASK)
+#define SECTION_NEXT_BOUNDARY(pfn)	(SECTION_ALIGN_DOWN(pfn) + PAGES_PER_SECTION)
 
 struct page;
 struct page_cgroup;
-- 
1.9.0

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@xxxxxxxxx.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@xxxxxxxxx";> email@xxxxxxxxx </a>




[Index of Archives]     [Linux ARM Kernel]     [Linux ARM]     [Linux Omap]     [Fedora ARM]     [IETF Annouce]     [Bugtraq]     [Linux]     [Linux OMAP]     [Linux MIPS]     [ECOS]     [Asterisk Internet PBX]     [Linux API]