On Thu, 9 Jun 2011 10:04:34 +0900
KAMEZAWA Hiroyuki <kamezawa.hiroyu@xxxxxxxxxxxxxx> wrote:

> On Wed, 8 Jun 2011 12:15:11 +0200
> Johannes Weiner <hannes@xxxxxxxxxxx> wrote:
>

Thank you for the review. Updated.
==
From a4f043565ad2388f930d956958b1167761e7d6a0 Mon Sep 17 00:00:00 2001
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@xxxxxxxxxxxxxx>
Date: Wed, 8 Jun 2011 17:13:37 +0900
Subject: [PATCH] [BUGFIX] Avoid getting nid from an invalid struct page at page_cgroup allocation.

With sparsemem, page_cgroup_init() scans pfns from 0 to max_pfn, but it
may visit pfns which are not on any node and access memmap which is not
initialized.

This patch makes page_cgroup_init() node aware for SPARSEMEM and removes
the code that derives the nid from page->flags, so a valid nid is always
used.

This is a fix for Bug 36192.

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@xxxxxxxxxxxxxx>

Changelog:
Jun/9
  - use PAGE_SECTION_MASK
  - add comments for init_section_page_cgroup
  - changed return value of init_section_page_cgroup
  - changed loop style
  - fixed nid handling at memory hotplug
Jun/8
  - moved ALIGN(pfn, PAGES_PER_SECTION) to where it is really meaningful
  - use "nid" instead of "node"
  - use pfn_valid() instead of pfn_present()
  - fixed usage of ALIGN...I misunderstood.
---
 mm/page_cgroup.c |   86 ++++++++++++++++++++++++++++++++++++++---------------
 1 files changed, 61 insertions(+), 25 deletions(-)

diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 74ccff6..9ef0657 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -162,21 +162,28 @@ static void free_page_cgroup(void *addr)
 }
 #endif
 
-static int __meminit init_section_page_cgroup(unsigned long pfn)
+/** init_section_page_cgroup - initialize page_cgroup for a section.
+ * @pfn: pfn of the section to be initialized.
+ * @nid: nid the memory should be allocated from.
+ *
+ * Allocates and initializes page_cgroup for a section. On success, this
+ * returns the start pfn of the next section. On failure, this returns 0.
+ */
+static unsigned long
+__meminit init_section_page_cgroup(unsigned long pfn, int nid)
 {
 	struct page_cgroup *base, *pc;
 	struct mem_section *section;
 	unsigned long table_size;
 	unsigned long nr;
-	int nid, index;
+	int index;
 
 	nr = pfn_to_section_nr(pfn);
 	section = __nr_to_section(nr);
 
 	if (section->page_cgroup)
-		return 0;
+		goto out;
 
-	nid = page_to_nid(pfn_to_page(pfn));
 	table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION;
 	base = alloc_page_cgroup(table_size, nid);
 
@@ -189,17 +196,22 @@ static int __meminit init_section_page_cgroup(unsigned long pfn)
 
 	if (!base) {
 		printk(KERN_ERR "page cgroup allocation failure\n");
-		return -ENOMEM;
+		return 0;
 	}
 
 	for (index = 0; index < PAGES_PER_SECTION; index++) {
 		pc = base + index;
 		init_page_cgroup(pc, nr);
 	}
-
+	/*
+	 * Even if the passed 'pfn' is not aligned to a section, we need to align
+	 * it to the section boundary because of SPARSEMEM pfn calculation.
+	 */
+	pfn = pfn & PAGE_SECTION_MASK;
 	section->page_cgroup = base - pfn;
 	total_usage += table_size;
-	return 0;
+out:
+	return ALIGN(pfn + 1, PAGES_PER_SECTION);
 }
 #ifdef CONFIG_MEMORY_HOTPLUG
 void __free_page_cgroup(unsigned long pfn)
@@ -220,19 +232,23 @@ int __meminit online_page_cgroup(unsigned long start_pfn,
 			int nid)
 {
 	unsigned long start, end, pfn;
-	int fail = 0;
 
 	start = start_pfn & ~(PAGES_PER_SECTION - 1);
 	end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION);
 
-	for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) {
-		if (!pfn_present(pfn))
+	if (nid == -1)
+		nid = pfn_to_nid(start_pfn);
+
+	pfn = start_pfn;
+	while (pfn < end) {
+		if (!pfn_valid(pfn))
 			continue;
-		fail = init_section_page_cgroup(pfn);
+		pfn = init_section_page_cgroup(pfn, nid);
+		if (!pfn)
+			goto rollback;
 	}
-	if (!fail)
-		return 0;
-
+	return 0;
+rollback:
 	/* rollback */
 	for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
 		__free_page_cgroup(pfn);
@@ -284,25 +300,45 @@ static int __meminit page_cgroup_callback(struct notifier_block *self,
 
 void __init page_cgroup_init(void)
 {
 	unsigned long pfn;
-	int fail = 0;
+	int nid;
 
 	if (mem_cgroup_disabled())
 		return;
 
-	for (pfn = 0; !fail && pfn < max_pfn; pfn += PAGES_PER_SECTION) {
-		if (!pfn_present(pfn))
-			continue;
-		fail = init_section_page_cgroup(pfn);
-	}
-	if (fail) {
-		printk(KERN_CRIT "try 'cgroup_disable=memory' boot option\n");
-		panic("Out of memory");
-	} else {
-		hotplug_memory_notifier(page_cgroup_callback, 0);
+	for_each_node_state(nid, N_HIGH_MEMORY) {
+		unsigned long start_pfn, end_pfn;
+
+		start_pfn = NODE_DATA(nid)->node_start_pfn;
+		end_pfn = start_pfn + NODE_DATA(nid)->node_spanned_pages;
+		/*
+		 * Because we cannot trust page->flags of pages outside the node
+		 * boundary, we skip pfn < start_pfn.
+		 */
+		pfn = start_pfn;
+		while (pfn < end_pfn) {
+			/*
+			 * Nodes can overlap.
+			 * We know some arches can have a node layout such as
+			 * -------------pfn-------------->
+			 * N0 | N1 | N2 | N0 | N1 | N2 |.....
+			 */
+			if (!pfn_valid(pfn) || (pfn_to_nid(pfn) != nid)) {
+				pfn = ALIGN(pfn + 1, PAGES_PER_SECTION);
+				continue;
+			}
+			pfn = init_section_page_cgroup(pfn, nid);
+			if (!pfn)
+				goto fail;
+		}
 	}
+	hotplug_memory_notifier(page_cgroup_callback, 0);
 	printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
 	printk(KERN_INFO "please try 'cgroup_disable=memory' option if you don't"
 	" want memory cgroups\n");
+	return;
+fail:
+	printk(KERN_CRIT "try 'cgroup_disable=memory' boot option\n");
+	panic("Out of memory");
 }
 
 void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
-- 
1.7.4.1
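
For readers following the new loop structure, here is a minimal userspace sketch (not kernel code) of the node-aware, section-by-section scan that page_cgroup_init() now performs. Everything prefixed with demo_, the node spans, and the section size are made-up stand-ins rather than kernel APIs; the point is only how the loop skips sections that are invalid or belong to another node, and advances by whole sections using the return value of the init helper.

/*
 * Userspace sketch of the node-aware section scan. demo_pfn_valid(),
 * demo_pfn_to_nid(), demo_init_section() and the node spans are invented
 * stand-ins for illustration only.
 */
#include <stdio.h>

#define PAGES_PER_SECTION 256UL   /* assumed section size for the demo */
#define NR_DEMO_NODES 2

struct demo_node { unsigned long start_pfn, end_pfn; };

/* Two nodes with overlapping spans: sections interleave N0 | N1 | N0 | N1 ... */
static struct demo_node nodes[NR_DEMO_NODES] = {
	{ 0, 2048 },
	{ 0, 2048 },
};

static int demo_pfn_valid(unsigned long pfn)
{
	return pfn < 2048;
}

static int demo_pfn_to_nid(unsigned long pfn)
{
	return (pfn / PAGES_PER_SECTION) & 1;	/* interleaved layout */
}

/* Stand-in for init_section_page_cgroup(): returns start pfn of the next section. */
static unsigned long demo_init_section(unsigned long pfn, int nid)
{
	printf("init section starting at pfn %lu for nid %d\n",
	       pfn & ~(PAGES_PER_SECTION - 1), nid);
	return (pfn & ~(PAGES_PER_SECTION - 1)) + PAGES_PER_SECTION;
}

int main(void)
{
	for (int nid = 0; nid < NR_DEMO_NODES; nid++) {
		unsigned long pfn = nodes[nid].start_pfn;

		while (pfn < nodes[nid].end_pfn) {
			/* Skip sections that are invalid or belong to another node. */
			if (!demo_pfn_valid(pfn) || demo_pfn_to_nid(pfn) != nid) {
				pfn = (pfn + PAGES_PER_SECTION) &
				      ~(PAGES_PER_SECTION - 1);
				continue;
			}
			pfn = demo_init_section(pfn, nid);
		}
	}
	return 0;
}

With the interleaved layout above, node 0 ends up initializing the even-numbered sections and node 1 the odd-numbered ones, which mirrors the "N0 | N1 | N0 | N1" layout described in the comment added to page_cgroup_init().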