Until now, we did not track how many references the page tables contain,
which made it impossible to free them from the MMU notifiers.

By updating the page table pages' refcounts each time we fault in or
evict a page, it becomes easy to track the state of the various page
tables, and to free them when they become empty. Only the top-level PGD
is never freed.

Signed-off-by: Marc Zyngier <marc.zyngier at arm.com>
---
 arch/arm/kvm/mmu.c |   72 ++++++++++++++++++++++++++++++++++++++++++++++------
 1 files changed, 64 insertions(+), 8 deletions(-)

diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
index 889b2ee..5353ad4 100644
--- a/arch/arm/kvm/mmu.c
+++ b/arch/arm/kvm/mmu.c
@@ -199,13 +199,16 @@ int kvm_alloc_stage2_pgd(struct kvm *kvm)
 static void free_guest_pages(pte_t *pte, unsigned long addr)
 {
 	unsigned int i;
-	struct page *page;
+	struct page *page, *pte_page;
+
+	pte_page = virt_to_page(pte);
 
 	for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
 		if (!pte_present(*pte))
 			goto next_page;
 		page = pfn_to_page(pte_pfn(*pte));
 		put_page(page);
+		put_page(pte_page);
 next_page:
 		pte++;
 	}
@@ -215,7 +218,9 @@ static void free_stage2_ptes(pmd_t *pmd, unsigned long addr)
 {
 	unsigned int i;
 	pte_t *pte;
-	struct page *page;
+	struct page *page, *pmd_page;
+
+	pmd_page = virt_to_page(pmd);
 
 	for (i = 0; i < PTRS_PER_PMD; i++, addr += PMD_SIZE) {
 		BUG_ON(pmd_sect(*pmd));
@@ -223,8 +228,10 @@ static void free_stage2_ptes(pmd_t *pmd, unsigned long addr)
 			pte = pte_offset_kernel(pmd, addr);
 			free_guest_pages(pte, addr);
 			page = virt_to_page((void *)pte);
-			WARN_ON(atomic_read(&page->_count) != 1);
+			WARN_ON(page_count(page) != 1);
 			pte_free_kernel(NULL, pte);
+
+			put_page(pmd_page);
 		}
 		pmd++;
 	}
@@ -247,6 +254,7 @@ void kvm_free_stage2_pgd(struct kvm *kvm)
 	pud_t *pud;
 	pmd_t *pmd;
 	unsigned long long i, addr;
+	struct page *page, *pud_page;
 
 	if (kvm->arch.pgd == NULL)
 		return;
@@ -260,6 +268,7 @@ void kvm_free_stage2_pgd(struct kvm *kvm)
 		addr = i * (unsigned long long)PGDIR_SIZE;
 		pgd = kvm->arch.pgd + i;
 		pud = pud_offset(pgd, addr);
+		pud_page = virt_to_page(pud);
 
 		if (pud_none(*pud))
 			continue;
@@ -268,14 +277,60 @@ void kvm_free_stage2_pgd(struct kvm *kvm)
 
 		pmd = pmd_offset(pud, addr);
 		free_stage2_ptes(pmd, addr);
+		page = virt_to_page((void *)pmd);
+		WARN_ON(page_count(page) != 1);
 		pmd_free(NULL, pmd);
+		put_page(pud_page);
 	}
 
+	WARN_ON(page_count(pud_page) != 1);
 	free_pages((unsigned long)kvm->arch.pgd, PGD2_ORDER);
 	kvm->arch.pgd = NULL;
 }
 
-static const pte_t null_pte;
+static int stage2_clear_pte(struct kvm *kvm, phys_addr_t addr)
+{
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *pte;
+	struct page *page;
+
+	pgd = kvm->arch.pgd + pgd_index(addr);
+	pud = pud_offset(pgd, addr);
+	BUG_ON(pud_none(*pud));
+
+	pmd = pmd_offset(pud, addr);
+	BUG_ON(pmd_none(*pmd));
+
+	pte = pte_offset_kernel(pmd, addr);
+	set_pte_ext(pte, __pte(0), 0);
+
+	page = virt_to_page(pte);
+	put_page(page);
+	if (page_count(page) != 1)
+		return 1;	/* TLB invalidate required */
+
+	/* Need to remove pte page */
+	pmd_clear(pmd);
+	__kvm_tlb_flush_vmid(kvm);
+	pte_free_kernel(NULL, (pte_t *)((unsigned long)pte & PAGE_MASK));
+
+	page = virt_to_page(pmd);
+	put_page(page);
+	if (page_count(page) != 1)
+		return 0;	/* TLB already invalidated */
+
+	/* Need to remove pmd page */
+	pud_clear(pud);
+	__kvm_tlb_flush_vmid(kvm);
+	pmd_free(NULL, (pmd_t *)((unsigned long)pmd & PAGE_MASK));
+
+	page = virt_to_page(pud);
+	put_page(page);
+
+	return 0;	/* TLB already invalidated */
+}
 
 static int stage2_set_pte(struct kvm *kvm, phys_addr_t addr, const pte_t *new_pte)
 {
@@ -288,7 +343,6 @@ static int stage2_set_pte(struct kvm *kvm, phys_addr_t addr, const pte_t *new_pt
 	pgd = kvm->arch.pgd + pgd_index(addr);
 	pud = pud_offset(pgd, addr);
 	if (pud_none(*pud)) {
-		BUG_ON(new_pte == &null_pte);
 		pmd = pmd_alloc_one(NULL, addr);
 		if (!pmd) {
 			kvm_err("Cannot allocate 2nd stage pmd\n");
@@ -296,12 +350,12 @@ static int stage2_set_pte(struct kvm *kvm, phys_addr_t addr, const pte_t *new_pt
 		}
 		pud_populate(NULL, pud, pmd);
 		pmd += pmd_index(addr);
+		get_page(virt_to_page(pud));
 	} else
 		pmd = pmd_offset(pud, addr);
 
 	/* Create 2nd stage page table mapping - Level 2 */
 	if (pmd_none(*pmd)) {
-		BUG_ON(new_pte == &null_pte);
 		pte = pte_alloc_one_kernel(NULL, addr);
 		if (!pte) {
 			kvm_err("Cannot allocate 2nd stage pte\n");
@@ -309,11 +363,13 @@ static int stage2_set_pte(struct kvm *kvm, phys_addr_t addr, const pte_t *new_pt
 		}
 		pmd_populate_kernel(NULL, pmd, pte);
 		pte += pte_index(addr);
+		get_page(virt_to_page(pmd));
 	} else
 		pte = pte_offset_kernel(pmd, addr);
 
 	/* Create 2nd stage page table mapping - Level 3 */
 	set_pte_ext(pte, *new_pte, 0);
+	get_page(virt_to_page(pte));
 
 	return 0;
 }
@@ -567,9 +623,9 @@ int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
 			gpa_t gpa_offset = hva - start;
 			gpa_t gpa = (memslot->base_gfn << PAGE_SHIFT) +
 				    gpa_offset;
 
-			stage2_set_pte(kvm, gpa, &null_pte);
-			needs_stage2_flush = 1;
+			needs_stage2_flush = stage2_clear_pte(kvm, gpa);
 		}
+	}
 
 	if (needs_stage2_flush)
-- 
1.7.7.1
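
As a rough sketch of the refcounting scheme above, outside of any kernel
context: the snippet below is plain user-space C, where "struct table_page"
and table_get()/table_put() are made-up stand-ins for struct page, get_page()
and put_page() on a page table's backing page. It only illustrates when a
table becomes freeable again; in the patch itself the same invariant is
checked with WARN_ON(page_count(page) != 1) before a table is freed.

/*
 * Toy illustration only -- not kernel code. The names below are invented
 * for this example and do not exist in arch/arm/kvm/mmu.c.
 */
#include <assert.h>
#include <stdio.h>

struct table_page {
	int refcount;	/* 1 right after allocation, i.e. the table is empty */
};

static void table_get(struct table_page *tp) { tp->refcount++; }
static void table_put(struct table_page *tp) { tp->refcount--; }

/* A table may be freed once only the allocation reference remains. */
static int table_is_empty(const struct table_page *tp)
{
	return tp->refcount == 1;
}

int main(void)
{
	struct table_page pte_table = { .refcount = 1 };

	/* Faulting in two guest pages takes one reference per new entry. */
	table_get(&pte_table);
	table_get(&pte_table);
	assert(!table_is_empty(&pte_table));

	/* Evicting them drops the references again... */
	table_put(&pte_table);
	table_put(&pte_table);

	/* ...so the now-empty table can be freed from the MMU notifier. */
	assert(table_is_empty(&pte_table));
	printf("pte table empty, can be freed\n");
	return 0;
}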