On Wed, Nov 06, 2013 at 02:21:39AM +0000, Atsushi Kumagai wrote: > (2013/11/06 5:27), Vivek Goyal wrote: > > On Tue, Nov 05, 2013 at 09:45:32PM +0800, Jingbai Ma wrote: > >> This patch set intends to exclude unnecessary hugepages from the vmcore dump file. > >> > >> This patch requires the kernel patch to export necessary data structures into > >> vmcore: "kexec: export hugepage data structure into vmcoreinfo" > >> http://lists.infradead.org/pipermail/kexec/2013-November/009997.html > >> > >> This patch introduces two new dump levels, 32 and 64, to exclude all unused and > >> active hugepages. The level to exclude all unnecessary pages will be 127 now. > > > > Interesting. Why should hugepages be treated any differently than normal > > pages? > > > > If the user asked to filter out a free page, then it should be filtered and > > it should not matter whether it is a huge page or not? > > I'm making an RFC patch for hugepage filtering based on that policy. > > I attach the prototype version. > It's also able to filter out THPs, and is suitable for cyclic processing > because it depends on mem_map, and looking it up can be divided into > cycles. This is the same idea as page_is_buddy(). > > So I think it's better. Agreed. Being able to treat hugepages in the same manner as other pages sounds good. Jingbai, does this look good to you? Thanks Vivek > > -- > Thanks > Atsushi Kumagai > > > From: Atsushi Kumagai <kumagai-atsushi at mxc.nes.nec.co.jp> > Date: Wed, 6 Nov 2013 10:10:43 +0900 > Subject: [PATCH] [RFC] Exclude hugepages. 
> > Signed-off-by: Atsushi Kumagai <kumagai-atsushi at mxc.nes.nec.co.jp> > --- > makedumpfile.c | 122 ++++++++++++++++++++++++++++++++++++++++++++++++++++++--- > makedumpfile.h | 8 ++++ > 2 files changed, 125 insertions(+), 5 deletions(-) > > diff --git a/makedumpfile.c b/makedumpfile.c > index 428c53e..75b7123 100644 > --- a/makedumpfile.c > +++ b/makedumpfile.c > @@ -63,6 +63,7 @@ do { \ > > static void check_cyclic_buffer_overrun(void); > static void setup_page_is_buddy(void); > +static void setup_page_is_hugepage(void); > > void > initialize_tables(void) > @@ -270,6 +271,18 @@ update_mmap_range(off_t offset, int initial) { > } > > static int > +page_is_hugepage(unsigned long flags) { > + if (NUMBER(PG_head) != NOT_FOUND_NUMBER) { > + return isHead(flags); > + } else if (NUMBER(PG_tail) != NOT_FOUND_NUMBER) { > + return isTail(flags); > + }if (NUMBER(PG_compound) != NOT_FOUND_NUMBER) { > + return isCompound(flags); > + } > + return 0; > +} > + > +static int > is_mapped_with_mmap(off_t offset) { > > if (info->flag_usemmap > @@ -1107,6 +1120,8 @@ get_symbol_info(void) > SYMBOL_ARRAY_LENGTH_INIT(node_remap_start_pfn, > "node_remap_start_pfn"); > > + SYMBOL_INIT(free_huge_page, "free_huge_page"); > + > return TRUE; > } > > @@ -1214,11 +1229,19 @@ get_structure_info(void) > > ENUM_NUMBER_INIT(PG_lru, "PG_lru"); > ENUM_NUMBER_INIT(PG_private, "PG_private"); > + ENUM_NUMBER_INIT(PG_head, "PG_head"); > + ENUM_NUMBER_INIT(PG_tail, "PG_tail"); > + ENUM_NUMBER_INIT(PG_compound, "PG_compound"); > ENUM_NUMBER_INIT(PG_swapcache, "PG_swapcache"); > ENUM_NUMBER_INIT(PG_buddy, "PG_buddy"); > ENUM_NUMBER_INIT(PG_slab, "PG_slab"); > ENUM_NUMBER_INIT(PG_hwpoison, "PG_hwpoison"); > > + if (NUMBER(PG_head) == NOT_FOUND_NUMBER && > + NUMBER(PG_compound) == NOT_FOUND_NUMBER) > + /* Pre-2.6.26 kernels did not have pageflags */ > + NUMBER(PG_compound) = PG_compound_ORIGINAL; > + > ENUM_TYPE_SIZE_INIT(pageflags, "pageflags"); > > TYPEDEF_SIZE_INIT(nodemask_t, "nodemask_t"); > @@ -1603,6 
+1626,7 @@ write_vmcoreinfo_data(void) > WRITE_SYMBOL("node_remap_start_vaddr", node_remap_start_vaddr); > WRITE_SYMBOL("node_remap_end_vaddr", node_remap_end_vaddr); > WRITE_SYMBOL("node_remap_start_pfn", node_remap_start_pfn); > + WRITE_SYMBOL("free_huge_page", free_huge_page); > > /* > * write the structure size of 1st kernel > @@ -1685,6 +1709,9 @@ write_vmcoreinfo_data(void) > > WRITE_NUMBER("PG_lru", PG_lru); > WRITE_NUMBER("PG_private", PG_private); > + WRITE_NUMBER("PG_head", PG_head); > + WRITE_NUMBER("PG_tail", PG_tail); > + WRITE_NUMBER("PG_compound", PG_compound); > WRITE_NUMBER("PG_swapcache", PG_swapcache); > WRITE_NUMBER("PG_buddy", PG_buddy); > WRITE_NUMBER("PG_slab", PG_slab); > @@ -1932,6 +1959,7 @@ read_vmcoreinfo(void) > READ_SYMBOL("node_remap_start_vaddr", node_remap_start_vaddr); > READ_SYMBOL("node_remap_end_vaddr", node_remap_end_vaddr); > READ_SYMBOL("node_remap_start_pfn", node_remap_start_pfn); > + READ_SYMBOL("free_huge_page", free_huge_page); > > READ_STRUCTURE_SIZE("page", page); > READ_STRUCTURE_SIZE("mem_section", mem_section); > @@ -2000,6 +2028,9 @@ read_vmcoreinfo(void) > > READ_NUMBER("PG_lru", PG_lru); > READ_NUMBER("PG_private", PG_private); > + READ_NUMBER("PG_head", PG_head); > + READ_NUMBER("PG_tail", PG_tail); > + READ_NUMBER("PG_compound", PG_compound); > READ_NUMBER("PG_swapcache", PG_swapcache); > READ_NUMBER("PG_slab", PG_slab); > READ_NUMBER("PG_buddy", PG_buddy); > @@ -3126,6 +3157,9 @@ out: > if (!get_value_for_old_linux()) > return FALSE; > > + /* Get page flags for compound pages */ > + setup_page_is_hugepage(); > + > /* use buddy identification of free pages whether cyclic or not */ > /* (this can reduce pages scan of 1TB memory from 60sec to 30sec) */ > if (info->dump_level & DL_EXCLUDE_FREE) > @@ -4197,6 +4231,23 @@ out: > "follow free lists instead of mem_map array.\n"); > } > > +static void > +setup_page_is_hugepage(void) > +{ > + if (NUMBER(PG_head) != NOT_FOUND_NUMBER) { > + if (NUMBER(PG_tail) == 
NOT_FOUND_NUMBER) { > + /* If PG_tail is not explicitly saved, then assume > + * that it immediately follows PG_head. > + */ > + NUMBER(PG_tail) = NUMBER(PG_head) + 1; > + } > + } else if ((NUMBER(PG_compound) != NOT_FOUND_NUMBER) > + && (info->dump_level & DL_EXCLUDE_USER_DATA)) { > + MSG("Compound page bit could not be determined: "); > + MSG("huge pages will NOT be filtered.\n"); > + } > +} > + > /* > * If using a dumpfile in kdump-compressed format as a source file > * instead of /proc/vmcore, 1st-bitmap of a new dumpfile must be > @@ -4404,8 +4455,9 @@ __exclude_unnecessary_pages(unsigned long mem_map, > unsigned long long pfn_read_start, pfn_read_end, index_pg; > unsigned char page_cache[SIZE(page) * PGMM_CACHED]; > unsigned char *pcache; > - unsigned int _count, _mapcount = 0; > + unsigned int _count, _mapcount = 0, compound_order = 0; > unsigned long flags, mapping, private = 0; > + unsigned long hugetlb_dtor; > > /* > * Refresh the buffer of struct page, when changing mem_map. > @@ -4459,6 +4511,27 @@ __exclude_unnecessary_pages(unsigned long mem_map, > flags = ULONG(pcache + OFFSET(page.flags)); > _count = UINT(pcache + OFFSET(page._count)); > mapping = ULONG(pcache + OFFSET(page.mapping)); > + > + if (index_pg < PGMM_CACHED - 1) { > + compound_order = ULONG(pcache + SIZE(page) + OFFSET(page.lru) > + + OFFSET(list_head.prev)); > + hugetlb_dtor = ULONG(pcache + SIZE(page) + OFFSET(page.lru) > + + OFFSET(list_head.next)); > + } else if (pfn + 1 < pfn_end) { > + unsigned char page_cache_next[SIZE(page)]; > + if (!readmem(VADDR, mem_map, page_cache_next, SIZE(page))) { > + ERRMSG("Can't read the buffer of struct page.\n"); > + return FALSE; > + } > + compound_order = ULONG(page_cache_next + OFFSET(page.lru) > + + OFFSET(list_head.prev)); > + hugetlb_dtor = ULONG(page_cache_next + OFFSET(page.lru) > + + OFFSET(list_head.next)); > + } else { > + compound_order = 0; > + hugetlb_dtor = 0; > + } > + > if (OFFSET(page._mapcount) != NOT_FOUND_STRUCTURE) > _mapcount 
= UINT(pcache + OFFSET(page._mapcount)); > if (OFFSET(page.private) != NOT_FOUND_STRUCTURE) > @@ -4497,6 +4570,10 @@ __exclude_unnecessary_pages(unsigned long mem_map, > && !isPrivate(flags) && !isAnon(mapping)) { > if (clear_bit_on_2nd_bitmap_for_kernel(pfn)) > pfn_cache++; > + /* > + * NOTE: If THP for cache is introduced, the check for > + * compound pages is needed here. > + */ > } > /* > * Exclude the cache page with the private page. > @@ -4506,14 +4583,49 @@ __exclude_unnecessary_pages(unsigned long mem_map, > && !isAnon(mapping)) { > if (clear_bit_on_2nd_bitmap_for_kernel(pfn)) > pfn_cache_private++; > + /* > + * NOTE: If THP for cache is introduced, the check for > + * compound pages is needed here. > + */ > } > /* > * Exclude the data page of the user process. > */ > - else if ((info->dump_level & DL_EXCLUDE_USER_DATA) > - && isAnon(mapping)) { > - if (clear_bit_on_2nd_bitmap_for_kernel(pfn)) > - pfn_user++; > + else if (info->dump_level & DL_EXCLUDE_USER_DATA) { > + /* > + * Exclude the anonnymous pages as user pages. > + */ > + if (isAnon(mapping)) { > + if (clear_bit_on_2nd_bitmap_for_kernel(pfn)) > + pfn_user++; > + > + /* > + * Check the compound page > + */ > + if (page_is_hugepage(flags) && compound_order > 0) { > + int i, nr_pages = 1 << compound_order; > + > + for (i = 1; i < nr_pages; ++i) { > + if (clear_bit_on_2nd_bitmap_for_kernel(pfn + i)) > + pfn_user++; > + } > + pfn += nr_pages - 2; > + mem_map += (nr_pages - 1) * SIZE(page); > + } > + } > + /* > + * Exclude the hugetlbfs pages as user pages. > + */ > + else if (hugetlb_dtor == SYMBOL(free_huge_page)) { > + int i, nr_pages = 1 << compound_order; > + > + for (i = 0; i < nr_pages; ++i) { > + if (clear_bit_on_2nd_bitmap_for_kernel(pfn + i)) > + pfn_user++; > + } > + pfn += nr_pages - 1; > + mem_map += (nr_pages - 1) * SIZE(page); > + } > } > /* > * Exclude the hwpoison page. 
> diff --git a/makedumpfile.h b/makedumpfile.h > index 3a7e61a..d6ee832 100644 > --- a/makedumpfile.h > +++ b/makedumpfile.h > @@ -74,6 +74,7 @@ int get_mem_type(void); > #define PG_lru_ORIGINAL (5) > #define PG_slab_ORIGINAL (7) > #define PG_private_ORIGINAL (11) /* Has something at ->private */ > +#define PG_compound_ORIGINAL (14) /* Is part of a compound page */ > #define PG_swapcache_ORIGINAL (15) /* Swap page: swp_entry_t in private */ > > #define PAGE_BUDDY_MAPCOUNT_VALUE_v2_6_38 (-2) > @@ -140,6 +141,9 @@ test_bit(int nr, unsigned long addr) > > #define isLRU(flags) test_bit(NUMBER(PG_lru), flags) > #define isPrivate(flags) test_bit(NUMBER(PG_private), flags) > +#define isHead(flags) test_bit(NUMBER(PG_head), flags) > +#define isTail(flags) test_bit(NUMBER(PG_tail), flags) > +#define isCompound(flags) test_bit(NUMBER(PG_compound), flags) > #define isSwapCache(flags) test_bit(NUMBER(PG_swapcache), flags) > #define isHWPOISON(flags) (test_bit(NUMBER(PG_hwpoison), flags) \ > && (NUMBER(PG_hwpoison) != NOT_FOUND_NUMBER)) > @@ -1124,6 +1128,7 @@ struct symbol_table { > unsigned long long node_remap_start_vaddr; > unsigned long long node_remap_end_vaddr; > unsigned long long node_remap_start_pfn; > + unsigned long long free_huge_page; > > /* > * for Xen extraction > @@ -1383,6 +1388,9 @@ struct number_table { > */ > long PG_lru; > long PG_private; > + long PG_head; > + long PG_tail; > + long PG_compound; > long PG_swapcache; > long PG_buddy; > long PG_slab; > -- > 1.8.0.2 >