On 2013/1/29 21:02, Simon Jeons wrote: > Hi Tang, > On Wed, 2013-01-09 at 17:32 +0800, Tang Chen wrote: >> From: Wen Congyang <wency@xxxxxxxxxxxxxx> >> >> When memory is removed, the corresponding pagetables should also be removed. >> This patch introduces some common APIs to support vmemmap pagetable and x86_64 >> architecture pagetable removing. >> > > When page table of hot-add memory is created? Hi Simon, For x86_64, page table of hot-add memory is created by: add_memory->arch_add_memory->init_memory_mapping->kernel_physical_mapping_init > >> All pages of virtual mapping in removed memory cannot be freed if some pages >> used as PGD/PUD includes not only removed memory but also other memory. So the >> patch uses the following way to check whether page can be freed or not. >> >> 1. When removing memory, the page structs of the removed memory are filled >> with 0xFD. >> 2. All page structs are filled with 0xFD on PT/PMD, PT/PMD can be cleared. >> In this case, the page used as PT/PMD can be freed. >> >> Signed-off-by: Yasuaki Ishimatsu <isimatu.yasuaki@xxxxxxxxxxxxxx> >> Signed-off-by: Jianguo Wu <wujianguo@xxxxxxxxxx> >> Signed-off-by: Wen Congyang <wency@xxxxxxxxxxxxxx> >> Signed-off-by: Tang Chen <tangchen@xxxxxxxxxxxxxx> >> --- >> arch/x86/include/asm/pgtable_types.h | 1 + >> arch/x86/mm/init_64.c | 299 ++++++++++++++++++++++++++++++++++ >> arch/x86/mm/pageattr.c | 47 +++--- >> include/linux/bootmem.h | 1 + >> 4 files changed, 326 insertions(+), 22 deletions(-) >> >> diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h >> index 3c32db8..4b6fd2a 100644 >> --- a/arch/x86/include/asm/pgtable_types.h >> +++ b/arch/x86/include/asm/pgtable_types.h >> @@ -352,6 +352,7 @@ static inline void update_page_count(int level, unsigned long pages) { } >> * as a pte too. 
>> */ >> extern pte_t *lookup_address(unsigned long address, unsigned int *level); >> +extern int __split_large_page(pte_t *kpte, unsigned long address, pte_t *pbase); >> >> #endif /* !__ASSEMBLY__ */ >> >> diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c >> index 9ac1723..fe01116 100644 >> --- a/arch/x86/mm/init_64.c >> +++ b/arch/x86/mm/init_64.c >> @@ -682,6 +682,305 @@ int arch_add_memory(int nid, u64 start, u64 size) >> } >> EXPORT_SYMBOL_GPL(arch_add_memory); >> >> +#define PAGE_INUSE 0xFD >> + >> +static void __meminit free_pagetable(struct page *page, int order) >> +{ >> + struct zone *zone; >> + bool bootmem = false; >> + unsigned long magic; >> + unsigned int nr_pages = 1 << order; >> + >> + /* bootmem page has reserved flag */ >> + if (PageReserved(page)) { >> + __ClearPageReserved(page); >> + bootmem = true; >> + >> + magic = (unsigned long)page->lru.next; >> + if (magic == SECTION_INFO || magic == MIX_SECTION_INFO) { >> + while (nr_pages--) >> + put_page_bootmem(page++); >> + } else >> + __free_pages_bootmem(page, order); >> + } else >> + free_pages((unsigned long)page_address(page), order); >> + >> + /* >> + * SECTION_INFO pages and MIX_SECTION_INFO pages >> + * are all allocated by bootmem. 
>> + */ >> + if (bootmem) { >> + zone = page_zone(page); >> + zone_span_writelock(zone); >> + zone->present_pages += nr_pages; >> + zone_span_writeunlock(zone); >> + totalram_pages += nr_pages; >> + } >> +} >> + >> +static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd) >> +{ >> + pte_t *pte; >> + int i; >> + >> + for (i = 0; i < PTRS_PER_PTE; i++) { >> + pte = pte_start + i; >> + if (pte_val(*pte)) >> + return; >> + } >> + >> + /* free a pte talbe */ >> + free_pagetable(pmd_page(*pmd), 0); >> + spin_lock(&init_mm.page_table_lock); >> + pmd_clear(pmd); >> + spin_unlock(&init_mm.page_table_lock); >> +} >> + >> +static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud) >> +{ >> + pmd_t *pmd; >> + int i; >> + >> + for (i = 0; i < PTRS_PER_PMD; i++) { >> + pmd = pmd_start + i; >> + if (pmd_val(*pmd)) >> + return; >> + } >> + >> + /* free a pmd talbe */ >> + free_pagetable(pud_page(*pud), 0); >> + spin_lock(&init_mm.page_table_lock); >> + pud_clear(pud); >> + spin_unlock(&init_mm.page_table_lock); >> +} >> + >> +/* Return true if pgd is changed, otherwise return false. 
*/ >> +static bool __meminit free_pud_table(pud_t *pud_start, pgd_t *pgd) >> +{ >> + pud_t *pud; >> + int i; >> + >> + for (i = 0; i < PTRS_PER_PUD; i++) { >> + pud = pud_start + i; >> + if (pud_val(*pud)) >> + return false; >> + } >> + >> + /* free a pud table */ >> + free_pagetable(pgd_page(*pgd), 0); >> + spin_lock(&init_mm.page_table_lock); >> + pgd_clear(pgd); >> + spin_unlock(&init_mm.page_table_lock); >> + >> + return true; >> +} >> + >> +static void __meminit >> +remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end, >> + bool direct) >> +{ >> + unsigned long next, pages = 0; >> + pte_t *pte; >> + void *page_addr; >> + phys_addr_t phys_addr; >> + >> + pte = pte_start + pte_index(addr); >> + for (; addr < end; addr = next, pte++) { >> + next = (addr + PAGE_SIZE) & PAGE_MASK; >> + if (next > end) >> + next = end; >> + >> + if (!pte_present(*pte)) >> + continue; >> + >> + /* >> + * We mapped [0,1G) memory as identity mapping when >> + * initializing, in arch/x86/kernel/head_64.S. These >> + * pagetables cannot be removed. >> + */ >> + phys_addr = pte_val(*pte) + (addr & PAGE_MASK); >> + if (phys_addr < (phys_addr_t)0x40000000) >> + return; >> + >> + if (IS_ALIGNED(addr, PAGE_SIZE) && >> + IS_ALIGNED(next, PAGE_SIZE)) { >> + if (!direct) { >> + free_pagetable(pte_page(*pte), 0); >> + pages++; >> + } >> + >> + spin_lock(&init_mm.page_table_lock); >> + pte_clear(&init_mm, addr, pte); >> + spin_unlock(&init_mm.page_table_lock); >> + } else { >> + /* >> + * If we are not removing the whole page, it means >> + * other ptes in this page are being used and we canot >> + * remove them. So fill the unused ptes with 0xFD, and >> + * remove the page when it is wholly filled with 0xFD. 
>> + */ >> + memset((void *)addr, PAGE_INUSE, next - addr); >> + page_addr = page_address(pte_page(*pte)); >> + >> + if (!memchr_inv(page_addr, PAGE_INUSE, PAGE_SIZE)) { >> + free_pagetable(pte_page(*pte), 0); >> + pages++; >> + >> + spin_lock(&init_mm.page_table_lock); >> + pte_clear(&init_mm, addr, pte); >> + spin_unlock(&init_mm.page_table_lock); >> + } >> + } >> + } >> + >> + /* Call free_pte_table() in remove_pmd_table(). */ >> + flush_tlb_all(); >> + if (direct) >> + update_page_count(PG_LEVEL_4K, -pages); >> +} >> + >> +static void __meminit >> +remove_pmd_table(pmd_t *pmd_start, unsigned long addr, unsigned long end, >> + bool direct) >> +{ >> + unsigned long pte_phys, next, pages = 0; >> + pte_t *pte_base; >> + pmd_t *pmd; >> + >> + pmd = pmd_start + pmd_index(addr); >> + for (; addr < end; addr = next, pmd++) { >> + next = pmd_addr_end(addr, end); >> + >> + if (!pmd_present(*pmd)) >> + continue; >> + >> + if (pmd_large(*pmd)) { >> + if (IS_ALIGNED(addr, PMD_SIZE) && >> + IS_ALIGNED(next, PMD_SIZE)) { >> + if (!direct) { >> + free_pagetable(pmd_page(*pmd), >> + get_order(PMD_SIZE)); >> + pages++; >> + } >> + >> + spin_lock(&init_mm.page_table_lock); >> + pmd_clear(pmd); >> + spin_unlock(&init_mm.page_table_lock); >> + continue; >> + } >> + >> + /* >> + * We use 2M page, but we need to remove part of them, >> + * so split 2M page to 4K page. >> + */ >> + pte_base = (pte_t *)alloc_low_page(&pte_phys); >> + BUG_ON(!pte_base); >> + __split_large_page((pte_t *)pmd, addr, >> + (pte_t *)pte_base); >> + >> + spin_lock(&init_mm.page_table_lock); >> + pmd_populate_kernel(&init_mm, pmd, __va(pte_phys)); >> + spin_unlock(&init_mm.page_table_lock); >> + >> + flush_tlb_all(); >> + } >> + >> + pte_base = (pte_t *)map_low_page((pte_t *)pmd_page_vaddr(*pmd)); >> + remove_pte_table(pte_base, addr, next, direct); >> + free_pte_table(pte_base, pmd); >> + unmap_low_page(pte_base); >> + } >> + >> + /* Call free_pmd_table() in remove_pud_table(). 
*/ >> + if (direct) >> + update_page_count(PG_LEVEL_2M, -pages); >> +} >> + >> +static void __meminit >> +remove_pud_table(pud_t *pud_start, unsigned long addr, unsigned long end, >> + bool direct) >> +{ >> + unsigned long pmd_phys, next, pages = 0; >> + pmd_t *pmd_base; >> + pud_t *pud; >> + >> + pud = pud_start + pud_index(addr); >> + for (; addr < end; addr = next, pud++) { >> + next = pud_addr_end(addr, end); >> + >> + if (!pud_present(*pud)) >> + continue; >> + >> + if (pud_large(*pud)) { >> + if (IS_ALIGNED(addr, PUD_SIZE) && >> + IS_ALIGNED(next, PUD_SIZE)) { >> + if (!direct) { >> + free_pagetable(pud_page(*pud), >> + get_order(PUD_SIZE)); >> + pages++; >> + } >> + >> + spin_lock(&init_mm.page_table_lock); >> + pud_clear(pud); >> + spin_unlock(&init_mm.page_table_lock); >> + continue; >> + } >> + >> + /* >> + * We use 1G page, but we need to remove part of them, >> + * so split 1G page to 2M page. >> + */ >> + pmd_base = (pmd_t *)alloc_low_page(&pmd_phys); >> + BUG_ON(!pmd_base); >> + __split_large_page((pte_t *)pud, addr, >> + (pte_t *)pmd_base); >> + >> + spin_lock(&init_mm.page_table_lock); >> + pud_populate(&init_mm, pud, __va(pmd_phys)); >> + spin_unlock(&init_mm.page_table_lock); >> + >> + flush_tlb_all(); >> + } >> + >> + pmd_base = (pmd_t *)map_low_page((pmd_t *)pud_page_vaddr(*pud)); >> + remove_pmd_table(pmd_base, addr, next, direct); >> + free_pmd_table(pmd_base, pud); >> + unmap_low_page(pmd_base); >> + } >> + >> + if (direct) >> + update_page_count(PG_LEVEL_1G, -pages); >> +} >> + >> +/* start and end are both virtual address. 
*/ >> +static void __meminit >> +remove_pagetable(unsigned long start, unsigned long end, bool direct) >> +{ >> + unsigned long next; >> + pgd_t *pgd; >> + pud_t *pud; >> + bool pgd_changed = false; >> + >> + for (; start < end; start = next) { >> + pgd = pgd_offset_k(start); >> + if (!pgd_present(*pgd)) >> + continue; >> + >> + next = pgd_addr_end(start, end); >> + >> + pud = (pud_t *)map_low_page((pud_t *)pgd_page_vaddr(*pgd)); >> + remove_pud_table(pud, start, next, direct); >> + if (free_pud_table(pud, pgd)) >> + pgd_changed = true; >> + unmap_low_page(pud); >> + } >> + >> + if (pgd_changed) >> + sync_global_pgds(start, end - 1); >> + >> + flush_tlb_all(); >> +} >> + >> #ifdef CONFIG_MEMORY_HOTREMOVE >> int __ref arch_remove_memory(u64 start, u64 size) >> { >> diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c >> index a718e0d..7dcb6f9 100644 >> --- a/arch/x86/mm/pageattr.c >> +++ b/arch/x86/mm/pageattr.c >> @@ -501,21 +501,13 @@ out_unlock: >> return do_split; >> } >> >> -static int split_large_page(pte_t *kpte, unsigned long address) >> +int __split_large_page(pte_t *kpte, unsigned long address, pte_t *pbase) >> { >> unsigned long pfn, pfninc = 1; >> unsigned int i, level; >> - pte_t *pbase, *tmp; >> + pte_t *tmp; >> pgprot_t ref_prot; >> - struct page *base; >> - >> - if (!debug_pagealloc) >> - spin_unlock(&cpa_lock); >> - base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0); >> - if (!debug_pagealloc) >> - spin_lock(&cpa_lock); >> - if (!base) >> - return -ENOMEM; >> + struct page *base = virt_to_page(pbase); >> >> spin_lock(&pgd_lock); >> /* >> @@ -523,10 +515,11 @@ static int split_large_page(pte_t *kpte, unsigned long address) >> * up for us already: >> */ >> tmp = lookup_address(address, &level); >> - if (tmp != kpte) >> - goto out_unlock; >> + if (tmp != kpte) { >> + spin_unlock(&pgd_lock); >> + return 1; >> + } >> >> - pbase = (pte_t *)page_address(base); >> paravirt_alloc_pte(&init_mm, page_to_pfn(base)); >> ref_prot = 
pte_pgprot(pte_clrhuge(*kpte)); >> /* >> @@ -579,17 +572,27 @@ static int split_large_page(pte_t *kpte, unsigned long address) >> * going on. >> */ >> __flush_tlb_all(); >> + spin_unlock(&pgd_lock); >> >> - base = NULL; >> + return 0; >> +} >> >> -out_unlock: >> - /* >> - * If we dropped out via the lookup_address check under >> - * pgd_lock then stick the page back into the pool: >> - */ >> - if (base) >> +static int split_large_page(pte_t *kpte, unsigned long address) >> +{ >> + pte_t *pbase; >> + struct page *base; >> + >> + if (!debug_pagealloc) >> + spin_unlock(&cpa_lock); >> + base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0); >> + if (!debug_pagealloc) >> + spin_lock(&cpa_lock); >> + if (!base) >> + return -ENOMEM; >> + >> + pbase = (pte_t *)page_address(base); >> + if (__split_large_page(kpte, address, pbase)) >> __free_page(base); >> - spin_unlock(&pgd_lock); >> >> return 0; >> } >> diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h >> index 3f778c2..190ff06 100644 >> --- a/include/linux/bootmem.h >> +++ b/include/linux/bootmem.h >> @@ -53,6 +53,7 @@ extern void free_bootmem_node(pg_data_t *pgdat, >> unsigned long size); >> extern void free_bootmem(unsigned long physaddr, unsigned long size); >> extern void free_bootmem_late(unsigned long physaddr, unsigned long size); >> +extern void __free_pages_bootmem(struct page *page, unsigned int order); >> >> /* >> * Flags for reserve_bootmem (also if CONFIG_HAVE_ARCH_BOOTMEM_NODE, > > > > . > -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@xxxxxxxxx. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@xxxxxxxxx"> email@xxxxxxxxx </a>