On Tue, Jul 31, 2012 at 10:44 AM, Christoffer Dall <c.dall@xxxxxxxxxxxxxxxxxxxxxx> wrote: > On Thu, Jul 5, 2012 at 5:04 PM, Marc Zyngier <marc.zyngier@xxxxxxx> wrote: >> Until now, we do not track how many references the page tables >> contain, which makes them impossible to free from the MMU notifiers. >> >> By updating the page table pages' refcounts each time we fault in >> or evict a page, it becomes easy to track the state of the various >> page tables, and to free them when they become empty. >> >> Only the level-1 PGD is never freed. >> >> Signed-off-by: Marc Zyngier <marc.zyngier@xxxxxxx> >> --- >> arch/arm/kvm/mmu.c | 86 ++++++++++++++++++++++++++++++++++++++++++++++------ >> 1 file changed, 76 insertions(+), 10 deletions(-) >> >> diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c >> index 4b174e6..33d34ea 100644 >> --- a/arch/arm/kvm/mmu.c >> +++ b/arch/arm/kvm/mmu.c >> @@ -200,12 +200,15 @@ int kvm_alloc_stage2_pgd(struct kvm *kvm) >> static void free_guest_pages(pte_t *pte, unsigned long addr) >> { >> unsigned int i; >> - struct page *page; >> + struct page *page, *pte_page; >> + >> + pte_page = virt_to_page(pte); >> >> for (i = 0; i < PTRS_PER_PTE; i++) { >> if (pte_present(*pte)) { >> page = pfn_to_page(pte_pfn(*pte)); >> put_page(page); >> + put_page(pte_page); >> } >> pte++; >> } >> @@ -215,7 +218,9 @@ static void free_stage2_ptes(pmd_t *pmd, unsigned long addr) >> { >> unsigned int i; >> pte_t *pte; >> - struct page *page; >> + struct page *page, *pmd_page; >> + >> + pmd_page = virt_to_page(pmd); >> >> for (i = 0; i < PTRS_PER_PMD; i++, addr += PMD_SIZE) { >> BUG_ON(pmd_sect(*pmd)); >> @@ -223,8 +228,10 @@ static void free_stage2_ptes(pmd_t *pmd, unsigned long addr) >> pte = pte_offset_kernel(pmd, addr); >> free_guest_pages(pte, addr); >> page = virt_to_page((void *)pte); >> - WARN_ON(atomic_read(&page->_count) != 1); >> + WARN_ON(page_count(page) != 1); > > while you're add it, why not move this check to free_guest_pages and > avoid the page 
variable altogether here? > >> pte_free_kernel(NULL, pte); >> + >> + put_page(pmd_page); >> } >> pmd++; >> } >> @@ -247,6 +254,7 @@ void kvm_free_stage2_pgd(struct kvm *kvm) >> pud_t *pud; >> pmd_t *pmd; >> unsigned long long i, addr; >> + struct page *page, *pud_page; >> >> if (kvm->arch.pgd == NULL) >> return; >> @@ -260,6 +268,7 @@ void kvm_free_stage2_pgd(struct kvm *kvm) >> addr = i * (unsigned long long)PGDIR_SIZE; >> pgd = kvm->arch.pgd + i; >> pud = pud_offset(pgd, addr); >> + pud_page = virt_to_page(pud); >> >> if (pud_none(*pud)) >> continue; >> @@ -268,14 +277,70 @@ void kvm_free_stage2_pgd(struct kvm *kvm) >> >> pmd = pmd_offset(pud, addr); >> free_stage2_ptes(pmd, addr); >> + page = virt_to_page((void *)pmd); >> + WARN_ON(page_count(page) != 1); >> pmd_free(NULL, pmd); >> + put_page(pud_page); >> } >> >> + WARN_ON(page_count(pud_page) != 1); >> free_pages((unsigned long)kvm->arch.pgd, PGD2_ORDER); >> kvm->arch.pgd = NULL; >> } >> >> -static const pte_t null_pte; >> +/* >> + * Clear a stage-2 PTE, lowering the various ref-counts. Also takes >> + * care of invalidating the TLBs. Must be called while holding >> + * pgd_lock, otherwise another faulting VCPU may come in and mess >> + * things behind our back. >> + */ > > you could consider using the kdoc format. > >> +static void stage2_clear_pte(struct kvm *kvm, phys_addr_t addr) >> +{ >> + pgd_t *pgd; >> + pud_t *pud; >> + pmd_t *pmd; >> + pte_t *pte; >> + struct page *page; >> + >> + pr_info("Clearing PTE&%08llx\n", addr); > > is this still in debug stage, should it be a trace event, or should it just go? 
> > also, there are the nice kvm_XXX print macros ;) > >> + pgd = kvm->arch.pgd + pgd_index(addr); >> + pud = pud_offset(pgd, addr); >> + BUG_ON(pud_none(*pud)); >> + >> + pmd = pmd_offset(pud, addr); >> + BUG_ON(pmd_none(*pmd)); >> + >> + pte = pte_offset_kernel(pmd, addr); >> + set_pte_ext(pte, __pte(0), 0); >> + >> + page = virt_to_page(pte); >> + put_page(page); >> + if (page_count(page) != 1) { >> + __kvm_tlb_flush_vmid(kvm); >> + return; >> + } >> + >> + /* Need to remove pte page */ >> + pmd_clear(pmd); >> + __kvm_tlb_flush_vmid(kvm); >> + pte_free_kernel(NULL, (pte_t *)((unsigned long)pte & PAGE_MASK)); >> + >> + page = virt_to_page(pmd); >> + put_page(page); >> + if (page_count(page) != 1) >> + return; >> + >> + /* >> + * Need to remove pmd page. This is the worse case, and we end > > s/worse/worst/ > >> + * up invalidating TLB twice. Should we care, really? > > s/invalidating TLB/invalidating the TLB/ > >> + */ > > not sure for performance, but for readability this is weird, imho. > > why not just have an out_invalidate label and replace the returns > with a goto. Standard practice, no? > never mind, of course we need to clear the TLB before we can free the object. 
>> + pud_clear(pud); >> + __kvm_tlb_flush_vmid(kvm); >> + pmd_free(NULL, (pmd_t *)((unsigned long)pmd & PAGE_MASK)); >> + >> + page = virt_to_page(pud); >> + put_page(page); >> +} >> >> static int stage2_set_pte(struct kvm *kvm, phys_addr_t addr, >> const pte_t *new_pte) >> @@ -289,7 +354,6 @@ static int stage2_set_pte(struct kvm *kvm, phys_addr_t addr, >> pgd = kvm->arch.pgd + pgd_index(addr); >> pud = pud_offset(pgd, addr); >> if (pud_none(*pud)) { >> - BUG_ON(new_pte == &null_pte); >> pmd = pmd_alloc_one(NULL, addr); >> if (!pmd) { >> kvm_err("Cannot allocate 2nd stage pmd\n"); >> @@ -297,12 +361,12 @@ static int stage2_set_pte(struct kvm *kvm, phys_addr_t addr, >> } >> pud_populate(NULL, pud, pmd); >> pmd += pmd_index(addr); >> + get_page(virt_to_page(pud)); >> } else >> pmd = pmd_offset(pud, addr); >> >> /* Create 2nd stage page table mapping - Level 2 */ >> if (pmd_none(*pmd)) { >> - BUG_ON(new_pte == &null_pte); >> pte = pte_alloc_one_kernel(NULL, addr); >> if (!pte) { >> kvm_err("Cannot allocate 2nd stage pte\n"); >> @@ -310,11 +374,14 @@ static int stage2_set_pte(struct kvm *kvm, phys_addr_t addr, >> } >> pmd_populate_kernel(NULL, pmd, pte); >> pte += pte_index(addr); >> + get_page(virt_to_page(pmd)); >> } else >> pte = pte_offset_kernel(pmd, addr); >> >> /* Create 2nd stage page table mapping - Level 3 */ >> + BUG_ON(pte_none(pte)); >> set_pte_ext(pte, *new_pte, 0); >> + get_page(virt_to_page(pte)); >> >> return 0; >> } >> @@ -597,6 +664,7 @@ static bool hva_to_gpa(struct kvm *kvm, unsigned long hva, gpa_t *gpa) >> /* no overlapping memslots allowed: break */ >> break; >> } >> + > > stray whitespace? 
> >> } >> >> mutex_unlock(&kvm->slots_lock); >> @@ -613,10 +681,8 @@ int kvm_unmap_hva(struct kvm *kvm, unsigned long hva) >> >> spin_lock(&kvm->arch.pgd_lock); >> found = hva_to_gpa(kvm, hva, &gpa); >> - if (found) { >> - stage2_set_pte(kvm, gpa, &null_pte); >> - __kvm_tlb_flush_vmid(kvm); >> - } >> + if (found) >> + stage2_clear_pte(kvm, gpa); >> spin_unlock(&kvm->arch.pgd_lock); >> return 0; >> } >> -- I've applied this (needed it for other purposes) with a bit of fixup. thanks, -Christoffer _______________________________________________ kvmarm mailing list kvmarm@xxxxxxxxxxxxxxxxxxxxx https://lists.cs.columbia.edu/cucslists/listinfo/kvmarm