On Mon, May 14, 2012 at 4:05 PM, Marc Zyngier <marc.zyngier at arm.com> wrote: > Until now, we do not track how many references the page tables > contain, which makes them impossible to free from the MMU notifiers. > > By updating the page table pages' refcounts each time we fault in > or evict a page, it becomes easy to track the state of the various > page tables, and to free them when them become empty. s/them/they/ > > Only the top PGD is never freed. s/top/level-1/ ? > > Signed-off-by: Marc Zyngier <marc.zyngier at arm.com> > --- > ?arch/arm/kvm/mmu.c | ? 72 ++++++++++++++++++++++++++++++++++++++++++++++------ > ?1 files changed, 64 insertions(+), 8 deletions(-) > > diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c > index 889b2ee..5353ad4 100644 > --- a/arch/arm/kvm/mmu.c > +++ b/arch/arm/kvm/mmu.c > @@ -199,13 +199,16 @@ int kvm_alloc_stage2_pgd(struct kvm *kvm) > ?static void free_guest_pages(pte_t *pte, unsigned long addr) > ?{ > ? ? ? ?unsigned int i; > - ? ? ? struct page *page; > + ? ? ? struct page *page, *pte_page; > + > + ? ? ? pte_page = virt_to_page(pte); > > ? ? ? ?for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) { > ? ? ? ? ? ? ? ?if (!pte_present(*pte)) > ? ? ? ? ? ? ? ? ? ? ? ?goto next_page; > ? ? ? ? ? ? ? ?page = pfn_to_page(pte_pfn(*pte)); > ? ? ? ? ? ? ? ?put_page(page); > + ? ? ? ? ? ? ? put_page(pte_page); > ?next_page: > ? ? ? ? ? ? ? ?pte++; > ? ? ? ?} > @@ -215,7 +218,9 @@ static void free_stage2_ptes(pmd_t *pmd, unsigned long addr) > ?{ > ? ? ? ?unsigned int i; > ? ? ? ?pte_t *pte; > - ? ? ? struct page *page; > + ? ? ? struct page *page, *pmd_page; > + > + ? ? ? pmd_page = virt_to_page(pmd); > > ? ? ? ?for (i = 0; i < PTRS_PER_PMD; i++, addr += PMD_SIZE) { > ? ? ? ? ? ? ? ?BUG_ON(pmd_sect(*pmd)); > @@ -223,8 +228,10 @@ static void free_stage2_ptes(pmd_t *pmd, unsigned long addr) > ? ? ? ? ? ? ? ? ? ? ? ?pte = pte_offset_kernel(pmd, addr); > ? ? ? ? ? ? ? ? ? ? ? ?free_guest_pages(pte, addr); > ? ? ? ? ? ? ? ? ? ? ? ?page = virt_to_page((void *)pte); > - ? ? ? ? ? ? ? ? ? ? ? WARN_ON(atomic_read(&page->_count) != 1); > + ? ? ? ? ? ? ? ? ? ? ? WARN_ON(page_count(page) != 1); > ? ? ? ? ? ? ? ? ? ? ? ?pte_free_kernel(NULL, pte); > + > + ? ? ? ? ? ? ? ? ? ? ? put_page(pmd_page); > ? ? ? ? ? ? ? ?} > ? ? ? ? ? ? ? ?pmd++; > ? ? ? ?} > @@ -247,6 +254,7 @@ void kvm_free_stage2_pgd(struct kvm *kvm) > ? ? ? ?pud_t *pud; > ? ? ? ?pmd_t *pmd; > ? ? ? ?unsigned long long i, addr; > + ? ? ? struct page *page, *pud_page; > > ? ? ? ?if (kvm->arch.pgd == NULL) > ? ? ? ? ? ? ? ?return; > @@ -260,6 +268,7 @@ void kvm_free_stage2_pgd(struct kvm *kvm) > ? ? ? ? ? ? ? ?addr = i * (unsigned long long)PGDIR_SIZE; > ? ? ? ? ? ? ? ?pgd = kvm->arch.pgd + i; > ? ? ? ? ? ? ? ?pud = pud_offset(pgd, addr); > + ? ? ? ? ? ? ? pud_page = virt_to_page(pud); > > ? ? ? ? ? ? ? ?if (pud_none(*pud)) > ? ? ? ? ? ? ? ? ? ? ? ?continue; > @@ -268,14 +277,60 @@ void kvm_free_stage2_pgd(struct kvm *kvm) > > ? ? ? ? ? ? ? ?pmd = pmd_offset(pud, addr); > ? ? ? ? ? ? ? ?free_stage2_ptes(pmd, addr); > + ? ? ? ? ? ? ? page = virt_to_page((void *)pmd); > + ? ? ? ? ? ? ? WARN_ON(page_count(page) != 1); > ? ? ? ? ? ? ? ?pmd_free(NULL, pmd); > + ? ? ? ? ? ? ? put_page(pud_page); > ? ? ? ?} > > + ? ? ? WARN_ON(page_count(pud_page) != 1); > ? ? ? ?free_pages((unsigned long)kvm->arch.pgd, PGD2_ORDER); > ? ? ? ?kvm->arch.pgd = NULL; > ?} > > -static const pte_t null_pte; > +static int stage2_clear_pte(struct kvm *kvm, phys_addr_t addr) > +{ > + ? ? ? pgd_t *pgd; > + ? ? ? pud_t *pud; > + ? ? ? pmd_t *pmd; > + ? ? ? pte_t *pte; > + ? ? ? struct page *page; > + > + ? ? ? pgd = kvm->arch.pgd + pgd_index(addr); > + ? ? ? pud = pud_offset(pgd, addr); > + ? ? ? BUG_ON(pud_none(*pud)); > + > + ? ? ? pmd = pmd_offset(pud, addr); > + ? ? ? BUG_ON(pmd_none(*pmd)); > + > + ? ? ? pte = pte_offset_kernel(pmd, addr); > + ? ? ? set_pte_ext(pte, __pte(0), 0); > + > + ? ? ? page = virt_to_page(pte); > + ? ? ? put_page(page); > + ? ? ? if (page_count(page) != 1) > + ? ? ? ? ? ? ? return 1; ? ? ? /* TLB invalidate required */ > + what's the distinction between returning 1 or 0 here? why not simply return 1 in all subsequent cases and let the caller free this? If this is for synchronization reasons and I fail to see it, then perhaps there should be a comment explaining it? In fact, what happens if another VCPU takes a page fault while this is running and page_count(page) hits 1, but before pmd_clear a new page has been allocated there. Don't we leak memory in this case? > + ? ? ? /* Need to remove pte page */ > + ? ? ? pmd_clear(pmd); > + ? ? ? __kvm_tlb_flush_vmid(kvm); > + ? ? ? pte_free_kernel(NULL, (pte_t *)((unsigned long)pte & PAGE_MASK)); > + > + ? ? ? page = virt_to_page(pmd); > + ? ? ? put_page(page); > + ? ? ? if (page_count(page) != 1) > + ? ? ? ? ? ? ? return 0; ? ? ? /* TLB already invalidated */ > + > + ? ? ? /* Need to remove pmd page */ > + ? ? ? pud_clear(pud); > + ? ? ? __kvm_tlb_flush_vmid(kvm); > + ? ? ? pmd_free(NULL, (pmd_t *)((unsigned long)pmd & PAGE_MASK)); > + > + ? ? ? page = virt_to_page(pud); > + ? ? ? put_page(page); > + > + ? ? ? return 0; ? ? ? ? ? ? ? /* TLB already invalidated */ > +} > > ?static int stage2_set_pte(struct kvm *kvm, phys_addr_t addr, const pte_t *new_pte) > ?{ > @@ -288,7 +343,6 @@ static int stage2_set_pte(struct kvm *kvm, phys_addr_t addr, const pte_t *new_pt > ? ? ? ?pgd = kvm->arch.pgd + pgd_index(addr); > ? ? ? ?pud = pud_offset(pgd, addr); > ? ? ? ?if (pud_none(*pud)) { > - ? ? ? ? ? ? ? BUG_ON(new_pte == &null_pte); > ? ? ? ? ? ? ? ?pmd = pmd_alloc_one(NULL, addr); > ? ? ? ? ? ? ? ?if (!pmd) { > ? ? ? ? ? ? ? ? ? ? ? ?kvm_err("Cannot allocate 2nd stage pmd\n"); > @@ -296,12 +350,12 @@ static int stage2_set_pte(struct kvm *kvm, phys_addr_t addr, const pte_t *new_pt > ? ? ? ? ? ? ? ?} > ? ? ? ? ? ? ? ?pud_populate(NULL, pud, pmd); > ? ? ? ? ? ? ? ?pmd += pmd_index(addr); > + ? ? ? ? ? ? ? get_page(virt_to_page(pud)); > ? ? ? ?} else > ? ? ? ? ? ? ? ?pmd = pmd_offset(pud, addr); > > ? ? ? ?/* Create 2nd stage page table mapping - Level 2 */ > ? ? ? ?if (pmd_none(*pmd)) { > - ? ? ? ? ? ? ? BUG_ON(new_pte == &null_pte); > ? ? ? ? ? ? ? ?pte = pte_alloc_one_kernel(NULL, addr); > ? ? ? ? ? ? ? ?if (!pte) { > ? ? ? ? ? ? ? ? ? ? ? ?kvm_err("Cannot allocate 2nd stage pte\n"); > @@ -309,11 +363,13 @@ static int stage2_set_pte(struct kvm *kvm, phys_addr_t addr, const pte_t *new_pt > ? ? ? ? ? ? ? ?} > ? ? ? ? ? ? ? ?pmd_populate_kernel(NULL, pmd, pte); > ? ? ? ? ? ? ? ?pte += pte_index(addr); > + ? ? ? ? ? ? ? get_page(virt_to_page(pmd)); > ? ? ? ?} else > ? ? ? ? ? ? ? ?pte = pte_offset_kernel(pmd, addr); > > ? ? ? ?/* Create 2nd stage page table mapping - Level 3 */ > ? ? ? ?set_pte_ext(pte, *new_pte, 0); > + ? ? ? get_page(virt_to_page(pte)); don't we need to check if !pte_none(pte) before setting it, and not call get_page in that case? seems we should have a BUG_ON or something otherwise. > > ? ? ? ?return 0; > ?} > @@ -567,9 +623,9 @@ int kvm_unmap_hva(struct kvm *kvm, unsigned long hva) > ? ? ? ? ? ? ? ? ? ? ? ?gpa_t gpa_offset = hva - start; > ? ? ? ? ? ? ? ? ? ? ? ?gpa_t gpa = (memslot->base_gfn << PAGE_SHIFT) + gpa_offset; > > - ? ? ? ? ? ? ? ? ? ? ? stage2_set_pte(kvm, gpa, &null_pte); > - ? ? ? ? ? ? ? ? ? ? ? needs_stage2_flush = 1; > + ? ? ? ? ? ? ? ? ? ? ? needs_stage2_flush = stage2_clear_pte(kvm, gpa); > ? ? ? ? ? ? ? ?} > + > ? ? ? ?} > > ? ? ? ?if (needs_stage2_flush) > -- > 1.7.7.1