On Wed, 8 Jun 2011 08:43:50 +0100
Mel Gorman <mgorman@xxxxxxx> wrote:

> On Wed, Jun 08, 2011 at 09:42:19AM +0900, KAMEZAWA Hiroyuki wrote:
> > On Wed, 8 Jun 2011 08:40:34 +0900
> > <SNIP>
>
> Missing a subject
>
> >
> > With sparsemem, page_cgroup_init scans pfn from 0 to max_pfn.
> > But this may scan a pfn which is not on any node and can access
> > memmap which is not initialized.
> >
> > This makes page_cgroup_init() for SPARSEMEM node aware and removes
> > the code that gets nid from page->flags. (Then, we'll always use a
> > valid NID.)
> >
> > Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@xxxxxxxxxxxxxx>
> > ---
> >  mm/page_cgroup.c |   41 +++++++++++++++++++++++++++++++++--------
> >  1 file changed, 33 insertions(+), 8 deletions(-)
> >
> > Index: linux-3.0-rc1/mm/page_cgroup.c
> > ===================================================================
> > --- linux-3.0-rc1.orig/mm/page_cgroup.c
> > +++ linux-3.0-rc1/mm/page_cgroup.c
> > @@ -162,21 +162,25 @@ static void free_page_cgroup(void *addr)
> >  }
> >  #endif
> >
> > -static int __meminit init_section_page_cgroup(unsigned long pfn)
> > +static int __meminit init_section_page_cgroup(unsigned long pfn, int nid)
> >  {
> >  	struct page_cgroup *base, *pc;
> >  	struct mem_section *section;
> >  	unsigned long table_size;
> >  	unsigned long nr;
> > -	int nid, index;
> > +	int index;
> >
> > +	/*
> > +	 * Even if the passed 'pfn' is not aligned to a section, we need to
> > +	 * align it to a section boundary because of SPARSEMEM pfn calculation.
> > +	 */
> > +	pfn = ALIGN(pfn, PAGES_PER_SECTION);
> >  	nr = pfn_to_section_nr(pfn);
>
> This comment is a bit opaque and from the context of the patch,
> it's hard to know why the alignment is necessary. At least move the
> alignment to beside where section->page_cgroup is set because it'll
> be easier to understand what is going on and why.
>

ok.

> >  	section = __nr_to_section(nr);
> >
> >  	if (section->page_cgroup)
> >  		return 0;
> >
> > -	nid = page_to_nid(pfn_to_page(pfn));
> >  	table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION;
> >  	base = alloc_page_cgroup(table_size, nid);
> >
> > @@ -228,7 +232,7 @@ int __meminit online_page_cgroup(unsigne
> >  	for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) {
> >  		if (!pfn_present(pfn))
> >  			continue;
> > -		fail = init_section_page_cgroup(pfn);
> > +		fail = init_section_page_cgroup(pfn, nid);
> >  	}
> >  	if (!fail)
> >  		return 0;
> > @@ -285,14 +289,35 @@ void __init page_cgroup_init(void)
> >  {
> >  	unsigned long pfn;
> >  	int fail = 0;
> > +	int node;
> >
>
> Very nit-picky but you sometimes use node and sometimes use nid.
> Personally, nid is my preferred choice of name as its meaning is
> unambiguous.
>

ok.

> >  	if (mem_cgroup_disabled())
> >  		return;
> >
> > -	for (pfn = 0; !fail && pfn < max_pfn; pfn += PAGES_PER_SECTION) {
> > -		if (!pfn_present(pfn))
> > -			continue;
> > -		fail = init_section_page_cgroup(pfn);
> > +	for_each_node_state(node, N_HIGH_MEMORY) {
> > +		unsigned long start_pfn, end_pfn;
> > +
> > +		start_pfn = NODE_DATA(node)->node_start_pfn;
> > +		end_pfn = start_pfn + NODE_DATA(node)->node_spanned_pages;
> > +		/*
> > +		 * Because we cannot trust page->flags of pages outside the
> > +		 * node boundary, we skip pfn < start_pfn.
> > +		 */
> > +		for (pfn = start_pfn;
> > +		     !fail && (pfn < end_pfn);
> > +		     pfn = ALIGN(pfn + PAGES_PER_SECTION, PAGES_PER_SECTION)) {
> > +			if (!pfn_present(pfn))
> > +				continue;
>
> Why did you not use pfn_valid()?
>
> pfn_valid checks that a section has SECTION_HAS_MEM_MAP.
> pfn_present checks that a section has SECTION_MARKED_PRESENT.
>
> SECTION_MARKED_PRESENT does not necessarily mean mem_map has been
> allocated, although I admit that this is somewhat unlikely. I'm just
> curious if you had a reason for avoiding pfn_valid()?
>

Hm, maybe I misunderstood something. I'll use pfn_valid().
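For context on that distinction: both checks test a flag bit encoded in
the section's section_mem_map word. A simplified sketch of the
include/linux/mmzone.h helpers behind pfn_present()/pfn_valid() (the flag
and helper names are the kernel's; the stripped-down struct here is mine,
the real one has more fields):

	/* Simplified from include/linux/mmzone.h of this era. */
	#define SECTION_MARKED_PRESENT	(1UL << 0)	/* section exists at all */
	#define SECTION_HAS_MEM_MAP	(1UL << 1)	/* its mem_map is set up */

	struct mem_section {
		unsigned long section_mem_map;	/* encoded map pointer | flags */
	};

	/* Backs pfn_present(): the section was marked present early in boot... */
	static inline int present_section(struct mem_section *section)
	{
		return section && (section->section_mem_map & SECTION_MARKED_PRESENT);
	}

	/* Backs pfn_valid(): ...but only this guarantees mem_map was allocated. */
	static inline int valid_section(struct mem_section *section)
	{
		return section && (section->section_mem_map & SECTION_HAS_MEM_MAP);
	}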
> > +			/*
> > +			 * Nodes can overlap.
> > +			 * We know some archs can have a node layout such as
> > +			 * -------------pfn-------------->
> > +			 *  N0 | N1 | N2 | N0 | N1 | N2 |.....
> > +			 */
> > +			if (pfn_to_nid(pfn) != node)
> > +				continue;
> > +			fail = init_section_page_cgroup(pfn, node);
> > +		}
> >  	}
> >  	if (fail) {
> >  		printk(KERN_CRIT "try 'cgroup_disable=memory' boot option\n");
> >
>
> FWIW, overall I think this is heading in the right direction.
>

Thank you. And I noticed I misunderstood what ALIGN() does.
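To make the ALIGN() point concrete: in the kernel, ALIGN(x, a) rounds x
*up* to the next multiple of a (effectively ((x) + (a) - 1) & ~((a) - 1)
for a power-of-two a), while masking with ~(a - 1) rounds down. A minimal
userspace sketch, assuming the x86-64 default PAGES_PER_SECTION of 0x8000
(2^15 pages):

	#include <stdio.h>

	#define ALIGN(x, a)		(((x) + (a) - 1) & ~((a) - 1))
	#define PAGES_PER_SECTION	0x8000UL	/* assumption: x86-64 default */

	int main(void)
	{
		unsigned long pfn = 0x8001UL;	/* unaligned pfn inside section 1 */

		/* Round down: the section base that v2 stores the table against. */
		printf("%#lx\n", pfn & ~(PAGES_PER_SECTION - 1));	  /* 0x8000  */

		/* ALIGN rounds up: wrong for the base, which was the v1 bug... */
		printf("%#lx\n", ALIGN(pfn, PAGES_PER_SECTION));	  /* 0x10000 */

		/* ...but right for the loop step; +1 makes an aligned pfn advance. */
		printf("%#lx\n", ALIGN(0x8000UL + 1, PAGES_PER_SECTION)); /* 0x10000 */
		return 0;
	}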
This patch is made against the latest mainline git tree. Tested on my
host, at least.
==
From 0485201fec6a9bbfabc4c2674756360c05080155 Mon Sep 17 00:00:00 2001
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@xxxxxxxxxxxxxx>
Date: Wed, 8 Jun 2011 17:13:37 +0900
Subject: [PATCH] [BUGFIX] Avoid getting nid from invalid struct page at
 page_cgroup allocation.

With sparsemem, page_cgroup_init scans pfn from 0 to max_pfn.
But this may scan a pfn which is not on any node and can access
memmap which is not initialized.

This makes page_cgroup_init() for SPARSEMEM node aware and removes
the code that gets nid from page->flags. (Then, we'll always use a
valid NID.)

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@xxxxxxxxxxxxxx>

Changelog:
 - moved the pfn alignment calculation to where it is really meaningful
 - use "nid" instead of "node"
 - use pfn_valid() instead of pfn_present()
 - fixed usage of ALIGN()...I had misunderstood it
---
 mm/page_cgroup.c |   43 ++++++++++++++++++++++++++++++++++---------
 1 files changed, 34 insertions(+), 9 deletions(-)

diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 74ccff6..ccc0c87 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -162,13 +162,13 @@ static void free_page_cgroup(void *addr)
 }
 #endif
 
-static int __meminit init_section_page_cgroup(unsigned long pfn)
+static int __meminit init_section_page_cgroup(unsigned long pfn, int nid)
 {
 	struct page_cgroup *base, *pc;
 	struct mem_section *section;
 	unsigned long table_size;
 	unsigned long nr;
-	int nid, index;
+	int index;
 
 	nr = pfn_to_section_nr(pfn);
 	section = __nr_to_section(nr);
@@ -176,7 +176,6 @@ static int __meminit init_section_page_cgroup(unsigned long pfn)
 	if (section->page_cgroup)
 		return 0;
 
-	nid = page_to_nid(pfn_to_page(pfn));
 	table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION;
 	base = alloc_page_cgroup(table_size, nid);
 
@@ -196,7 +195,11 @@ static int __meminit init_section_page_cgroup(unsigned long pfn)
 		pc = base + index;
 		init_page_cgroup(pc, nr);
 	}
-
+	/*
+	 * Even if the passed 'pfn' is not aligned to a section, we need to
+	 * align it to a section boundary because of SPARSEMEM pfn calculation.
+	 */
+	pfn = pfn & ~(PAGES_PER_SECTION - 1);
 	section->page_cgroup = base - pfn;
 	total_usage += table_size;
 	return 0;
@@ -228,7 +231,7 @@ int __meminit online_page_cgroup(unsigned long start_pfn,
 	for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) {
 		if (!pfn_present(pfn))
 			continue;
-		fail = init_section_page_cgroup(pfn);
+		fail = init_section_page_cgroup(pfn, nid);
 	}
 	if (!fail)
 		return 0;
@@ -285,14 +288,36 @@ void __init page_cgroup_init(void)
 {
 	unsigned long pfn;
 	int fail = 0;
+	int nid;
 
 	if (mem_cgroup_disabled())
 		return;
 
-	for (pfn = 0; !fail && pfn < max_pfn; pfn += PAGES_PER_SECTION) {
-		if (!pfn_present(pfn))
-			continue;
-		fail = init_section_page_cgroup(pfn);
+	for_each_node_state(nid, N_HIGH_MEMORY) {
+		unsigned long start_pfn, end_pfn;
+
+		start_pfn = NODE_DATA(nid)->node_start_pfn;
+		end_pfn = start_pfn + NODE_DATA(nid)->node_spanned_pages;
+		/*
+		 * Because we cannot trust page->flags of pages outside the
+		 * node boundary, we skip pfn < start_pfn.
+		 */
+		for (pfn = start_pfn;
+		     !fail && (pfn < end_pfn);
+		     pfn = ALIGN(pfn + 1, PAGES_PER_SECTION)) {
+
+			if (!pfn_valid(pfn))
+				continue;
+			/*
+			 * Nodes can overlap.
+			 * We know some archs can have a node layout such as
+			 * -------------pfn-------------->
+			 *  N0 | N1 | N2 | N0 | N1 | N2 |.....
+			 */
+			if (pfn_to_nid(pfn) != nid)
+				continue;
+			fail = init_section_page_cgroup(pfn, nid);
+		}
 	}
 	if (fail) {
 		printk(KERN_CRIT "try 'cgroup_disable=memory' boot option\n");
-- 
1.7.4.1
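One more bit of context on why the round-down before
"section->page_cgroup = base - pfn" matters: the SPARSEMEM lookup adds
the raw, unaligned pfn straight back onto the stored base. Roughly what
mm/page_cgroup.c does here (a paraphrased sketch of the lookup from
kernels of this era, not a verbatim quote):

	struct page_cgroup *lookup_page_cgroup(struct page *page)
	{
		unsigned long pfn = page_to_pfn(page);
		struct mem_section *section = __pfn_to_section(pfn);

		if (!section->page_cgroup)
			return NULL;
		/*
		 * page_cgroup was stored as (table base - section start pfn),
		 * so indexing by the full pfn lands on the right entry. Had
		 * the base been offset by an unaligned start_pfn instead,
		 * every lookup in the section would be shifted by the same
		 * amount.
		 */
		return section->page_cgroup + pfn;
	}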