Chris Wright wrote: >* Zachary Amsden (zach@xxxxxxxxxx) wrote: > > >>Chris Wright wrote: >> >> >> >>>* Zachary Amsden (zach@xxxxxxxxxx) wrote: >>> >>> >>> >>> >>>>--- linux-2.6.13.orig/arch/i386/mm/init.c 2005-08-24 >>>>09:31:05.000000000 -0700 >>>>+++ linux-2.6.13/arch/i386/mm/init.c 2005-08-24 09:31:31.000000000 -0700 >>>>@@ -79,6 +79,7 @@ static pte_t * __init one_page_table_ini >>>>{ >>>> if (pmd_none(*pmd)) { >>>> pte_t *page_table = (pte_t *) >>>> alloc_bootmem_low_pages(PAGE_SIZE); >>>>+ SetPagePTE(virt_to_page(page_table)); >>>> >>>> >>>> >>>> >>>Xen has this on one_md_table_init() as well for pmd. >>> >>> >>I'll add that in another patch. It's easy to miss some of the init time >>call sites (we don't actually depend on them for correctness). >> >> > >You could just respin this patch. > > Done. Looks like you want empty_zero_page write protected too. That seems like a fine idea to me unless something really wants to do atomic 64-bit reads on it. >>>> >>>> >>>Why the switch of kmem_cache_free call? >>> >>> >>Because pgd_val(pgd[i])-1 is confusing. >> >> > >Yes, I agree it is. That's why I was wondering about the ClearPagePDE calls >which don't use them. > > The -1 is quite useless when you're going to shift >> 12 anyways to get a frame number to index into mem_map, which is why they are not there. Plus, it just seems scary if you got it wrong - let's say you had a not present page - not that you could, but now you are freeing a misaligned address in the _previous_ page. I really don't like that -1 at all. I will clean it up, but it does certainly deserve another patch. > > >>Using (pgd_val(pgd[i]) - >>_PAGE_PRESENT) would be better, but the +/- 1s all over the place here >>could use some general cleanup as well. I smell a cleanup fit coming >>on. Using (pgd_val(pgd[i]) & PAGE_MASK) is a less error prone way to >>get the physical frame bits, since it is not wrong if you turn on PCD or >>PWD. >> >> > >Please save all that for later cleanup. As it stands it's just a conversion >of one random call site, which should be dropped from the patch. > > Working on it now. -------------- next part -------------- Hooks are provided for the mach-XXX subarchitecture at the time prior to a page being used as a page table at all levels, for PAE and non-PAE kernels. Note that in PAE mode, multiple PDP roots may exist on the same page with other data, so the root must be shadowed instead. This is not a performance issue, since PAE only uses 4 top level PDPEs. The hooks are: SetPagePTE(ppn) - indicates that a given physical page is going to be used as a page table. SetPagePDE(ppn) - indicates that a given physical page is going to be used as a page directory. ClearPageXXX(ppn) - indicates that the physical page is now done being used as a certain type of page. These hooks can be used in two ways; for shadow mode, they serve as requests to pre-allocate and deallocate shadow page tables, and for direct page table mode, they serve as write protect/unprotect requests. Signed-off-by: Zachary Amsden <zach@xxxxxxxxxx> Patch-subject: Add pagetable allocation notifiers Index: linux-2.6.13/arch/i386/mm/init.c =================================================================== --- linux-2.6.13.orig/arch/i386/mm/init.c 2005-08-24 14:34:30.000000000 -0700 +++ linux-2.6.13/arch/i386/mm/init.c 2005-08-24 14:35:50.000000000 -0700 @@ -59,6 +59,7 @@ static pmd_t * __init one_md_table_init( #ifdef CONFIG_X86_PAE pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE); + SetPagePDE(virt_to_page(pmd_table)); set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); pud = pud_offset(pgd, 0); if (pmd_table != pmd_offset(pud, 0)) @@ -79,6 +80,7 @@ static pte_t * __init one_page_table_ini { if (pmd_none(*pmd)) { pte_t *page_table = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE); + SetPagePTE(virt_to_page(page_table)); set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE)); if (page_table != pte_offset_kernel(pmd, 0)) BUG(); @@ -313,6 +315,7 @@ static void __init pagetable_init (void) #ifdef CONFIG_X86_PAE int i; /* Init entries of the first-level page table to the zero page */ + SetPagePDE(virt_to_page(empty_zero_page)); for (i = 0; i < PTRS_PER_PGD; i++) set_pgd(pgd_base + i, __pgd(__pa(empty_zero_page) | _PAGE_PRESENT)); #endif Index: linux-2.6.13/arch/i386/mm/pageattr.c =================================================================== --- linux-2.6.13.orig/arch/i386/mm/pageattr.c 2005-08-24 14:34:30.000000000 -0700 +++ linux-2.6.13/arch/i386/mm/pageattr.c 2005-08-24 14:36:09.000000000 -0700 @@ -52,6 +52,7 @@ static struct page *split_large_page(uns address = __pa(address); addr = address & LARGE_PAGE_MASK; pbase = (pte_t *)page_address(base); + SetPagePTE(virt_to_page(pbase)); for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) { set_pte(&pbase[i], pfn_pte(addr >> PAGE_SHIFT, addr == address ? prot : PAGE_KERNEL)); @@ -146,6 +147,7 @@ __change_page_attr(struct page *page, pg BUG_ON(!page_count(kpte_page)); if (cpu_has_pse && (page_count(kpte_page) == 1)) { + ClearPagePTE(virt_to_page(kpte)); list_add(&kpte_page->lru, &df_list); revert_page(kpte_page, address); } Index: linux-2.6.13/arch/i386/mm/pgtable.c =================================================================== --- linux-2.6.13.orig/arch/i386/mm/pgtable.c 2005-08-24 14:34:30.000000000 -0700 +++ linux-2.6.13/arch/i386/mm/pgtable.c 2005-08-24 14:35:50.000000000 -0700 @@ -209,6 +209,7 @@ void pgd_ctor(void *pgd, kmem_cache_t *c if (PTRS_PER_PMD == 1) { memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t)); + SetPagePDE(virt_to_page(pgd)); spin_lock_irqsave(&pgd_lock, flags); } @@ -227,6 +228,7 @@ void pgd_dtor(void *pgd, kmem_cache_t *c { unsigned long flags; /* can be called from interrupt context */ + ClearPagePDE(virt_to_page(pgd)); spin_lock_irqsave(&pgd_lock, flags); pgd_list_del(pgd); spin_unlock_irqrestore(&pgd_lock, flags); @@ -244,13 +246,16 @@ pgd_t *pgd_alloc(struct mm_struct *mm) pmd_t *pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL); if (!pmd) goto out_oom; + SetPagePDE(virt_to_page(pmd)); set_pgd(&pgd[i], __pgd(1 + __pa(pmd))); } return pgd; out_oom: - for (i--; i >= 0; i--) + for (i--; i >= 0; i--) { + ClearPagePDE(pfn_to_page(pgd_val(pgd[i]) >> PAGE_SHIFT)); kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1)); + } kmem_cache_free(pgd_cache, pgd); return NULL; } @@ -261,8 +266,10 @@ void pgd_free(pgd_t *pgd) /* in the PAE case user pgd entries are overwritten before usage */ if (PTRS_PER_PMD > 1) - for (i = 0; i < USER_PTRS_PER_PGD; ++i) + for (i = 0; i < USER_PTRS_PER_PGD; ++i) { + ClearPagePDE(pfn_to_page(pgd_val(pgd[i]) >> PAGE_SHIFT)); kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1)); + } /* in the non-PAE case, free_pgtables() clears user pgd entries */ kmem_cache_free(pgd_cache, pgd); } Index: linux-2.6.13/include/asm-i386/pgalloc.h =================================================================== --- linux-2.6.13.orig/include/asm-i386/pgalloc.h 2005-08-24 14:34:30.000000000 -0700 +++ linux-2.6.13/include/asm-i386/pgalloc.h 2005-08-24 14:35:50.000000000 -0700 @@ -5,14 +5,22 @@ #include <asm/fixmap.h> #include <linux/threads.h> #include <linux/mm.h> /* for struct page */ +#include <mach_pgalloc.h> -#define pmd_populate_kernel(mm, pmd, pte) \ - set_pmd(pmd, __pmd(_PAGE_TABLE + __pa(pte))) +#define pmd_populate_kernel(mm, pmd, pte) \ +do { \ + SetPagePTE(virt_to_page(pte)); \ + set_pmd(pmd, __pmd(_PAGE_TABLE + __pa(pte))); \ +} while (0) #define pmd_populate(mm, pmd, pte) \ +do { \ + SetPagePTE(pte); \ set_pmd(pmd, __pmd(_PAGE_TABLE + \ ((unsigned long long)page_to_pfn(pte) << \ - (unsigned long long) PAGE_SHIFT))) + (unsigned long long) PAGE_SHIFT))); \ +} while (0) + /* * Allocate and free page tables. */ @@ -33,7 +41,11 @@ static inline void pte_free(struct page } -#define __pte_free_tlb(tlb,pte) tlb_remove_page((tlb),(pte)) +#define __pte_free_tlb(tlb,pte) \ +do { \ + tlb_remove_page((tlb),(pte)); \ + ClearPagePTE(pte); \ +} while (0) #ifdef CONFIG_X86_PAE /* Index: linux-2.6.13/include/asm-i386/mach-default/mach_pgalloc.h =================================================================== --- linux-2.6.13.orig/include/asm-i386/mach-default/mach_pgalloc.h 2005-08-24 14:34:30.000000000 -0700 +++ linux-2.6.13/include/asm-i386/mach-default/mach_pgalloc.h 2005-08-24 14:35:50.000000000 -0700 @@ -1,7 +1,40 @@ +/* + * Copyright (C) 2005, VMware, Inc. + * + * All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * Send feedback to zach@xxxxxxxxxx + * + */ #ifndef __ASM_MACH_PGALLOC_H #define __ASM_MACH_PGALLOC_H +/* + * These hooks allow the hypervisor to be notified about page type + * transition events. + */ + +#define SetPagePTE(_page) +#define ClearPagePTE(_page) +#define SetPagePDE(_page) +#define ClearPagePDE(_page) + #define SetPagesLDT(_va, _pages) #define ClearPagesLDT(_va, _pages) -#endif +#endif /* _ASM_MACH_PGALLOC_H */