On Thu, Jul 5, 2012 at 5:04 PM, Marc Zyngier <marc.zyngier@xxxxxxx> wrote: > Until now, we do not track how many references the page tables > contain, which makes them impossible to free from the MMU notifiers. > > By updating the page table pages' refcounts each time we fault in > or evict a page, it becomes easy to track the state of the various > page tables, and to free them when they become empty. > > Only the level-1 PGD is never freed. > > Signed-off-by: Marc Zyngier <marc.zyngier@xxxxxxx> > --- > arch/arm/kvm/mmu.c | 86 ++++++++++++++++++++++++++++++++++++++++++++++------ > 1 file changed, 76 insertions(+), 10 deletions(-) > > diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c > index 4b174e6..33d34ea 100644 > --- a/arch/arm/kvm/mmu.c > +++ b/arch/arm/kvm/mmu.c > @@ -200,12 +200,15 @@ int kvm_alloc_stage2_pgd(struct kvm *kvm) > static void free_guest_pages(pte_t *pte, unsigned long addr) > { > unsigned int i; > - struct page *page; > + struct page *page, *pte_page; > + > + pte_page = virt_to_page(pte); > > for (i = 0; i < PTRS_PER_PTE; i++) { > if (pte_present(*pte)) { > page = pfn_to_page(pte_pfn(*pte)); > put_page(page); > + put_page(pte_page); > } > pte++; > } > @@ -215,7 +218,9 @@ static void free_stage2_ptes(pmd_t *pmd, unsigned long addr) > { > unsigned int i; > pte_t *pte; > - struct page *page; > + struct page *page, *pmd_page; > + > + pmd_page = virt_to_page(pmd); > > for (i = 0; i < PTRS_PER_PMD; i++, addr += PMD_SIZE) { > BUG_ON(pmd_sect(*pmd)); > @@ -223,8 +228,10 @@ static void free_stage2_ptes(pmd_t *pmd, unsigned long addr) > pte = pte_offset_kernel(pmd, addr); > free_guest_pages(pte, addr); > page = virt_to_page((void *)pte); > - WARN_ON(atomic_read(&page->_count) != 1); > + WARN_ON(page_count(page) != 1); while you're at it, why not move this check to free_guest_pages and avoid the page variable altogether here? 
> pte_free_kernel(NULL, pte); > + > + put_page(pmd_page); > } > pmd++; > } > @@ -247,6 +254,7 @@ void kvm_free_stage2_pgd(struct kvm *kvm) > pud_t *pud; > pmd_t *pmd; > unsigned long long i, addr; > + struct page *page, *pud_page; > > if (kvm->arch.pgd == NULL) > return; > @@ -260,6 +268,7 @@ void kvm_free_stage2_pgd(struct kvm *kvm) > addr = i * (unsigned long long)PGDIR_SIZE; > pgd = kvm->arch.pgd + i; > pud = pud_offset(pgd, addr); > + pud_page = virt_to_page(pud); > > if (pud_none(*pud)) > continue; > @@ -268,14 +277,70 @@ void kvm_free_stage2_pgd(struct kvm *kvm) > > pmd = pmd_offset(pud, addr); > free_stage2_ptes(pmd, addr); > + page = virt_to_page((void *)pmd); > + WARN_ON(page_count(page) != 1); > pmd_free(NULL, pmd); > + put_page(pud_page); > } > > + WARN_ON(page_count(pud_page) != 1); > free_pages((unsigned long)kvm->arch.pgd, PGD2_ORDER); > kvm->arch.pgd = NULL; > } > > -static const pte_t null_pte; > +/* > + * Clear a stage-2 PTE, lowering the various ref-counts. Also takes > + * care of invalidating the TLBs. Must be called while holding > + * pgd_lock, otherwise another faulting VCPU may come in and mess > + * things behind our back. > + */ you could consider using the kdoc format. > +static void stage2_clear_pte(struct kvm *kvm, phys_addr_t addr) > +{ > + pgd_t *pgd; > + pud_t *pud; > + pmd_t *pmd; > + pte_t *pte; > + struct page *page; > + > + pr_info("Clearing PTE&%08llx\n", addr); is this still in debug stage, should it be a trace event, or should it just go? 
also, there are the nice kvm_XXX print macros ;) > + pgd = kvm->arch.pgd + pgd_index(addr); > + pud = pud_offset(pgd, addr); > + BUG_ON(pud_none(*pud)); > + > + pmd = pmd_offset(pud, addr); > + BUG_ON(pmd_none(*pmd)); > + > + pte = pte_offset_kernel(pmd, addr); > + set_pte_ext(pte, __pte(0), 0); > + > + page = virt_to_page(pte); > + put_page(page); > + if (page_count(page) != 1) { > + __kvm_tlb_flush_vmid(kvm); > + return; > + } > + > + /* Need to remove pte page */ > + pmd_clear(pmd); > + __kvm_tlb_flush_vmid(kvm); > + pte_free_kernel(NULL, (pte_t *)((unsigned long)pte & PAGE_MASK)); > + > + page = virt_to_page(pmd); > + put_page(page); > + if (page_count(page) != 1) > + return; > + > + /* > + * Need to remove pmd page. This is the worse case, and we end s/worse/worst/ > + * up invalidating TLB twice. Should we care, really? s/invalidating TLB/invalidating the TLB/ > + */ not sure for performance, but for readability this is weird, imho. why not just have an out_invalidate label and replace the returns with a goto. Standard practice, no? 
> + pud_clear(pud); > + __kvm_tlb_flush_vmid(kvm); > + pmd_free(NULL, (pmd_t *)((unsigned long)pmd & PAGE_MASK)); > + > + page = virt_to_page(pud); > + put_page(page); > +} > > static int stage2_set_pte(struct kvm *kvm, phys_addr_t addr, > const pte_t *new_pte) > @@ -289,7 +354,6 @@ static int stage2_set_pte(struct kvm *kvm, phys_addr_t addr, > pgd = kvm->arch.pgd + pgd_index(addr); > pud = pud_offset(pgd, addr); > if (pud_none(*pud)) { > - BUG_ON(new_pte == &null_pte); > pmd = pmd_alloc_one(NULL, addr); > if (!pmd) { > kvm_err("Cannot allocate 2nd stage pmd\n"); > @@ -297,12 +361,12 @@ static int stage2_set_pte(struct kvm *kvm, phys_addr_t addr, > } > pud_populate(NULL, pud, pmd); > pmd += pmd_index(addr); > + get_page(virt_to_page(pud)); > } else > pmd = pmd_offset(pud, addr); > > /* Create 2nd stage page table mapping - Level 2 */ > if (pmd_none(*pmd)) { > - BUG_ON(new_pte == &null_pte); > pte = pte_alloc_one_kernel(NULL, addr); > if (!pte) { > kvm_err("Cannot allocate 2nd stage pte\n"); > @@ -310,11 +374,14 @@ static int stage2_set_pte(struct kvm *kvm, phys_addr_t addr, > } > pmd_populate_kernel(NULL, pmd, pte); > pte += pte_index(addr); > + get_page(virt_to_page(pmd)); > } else > pte = pte_offset_kernel(pmd, addr); > > /* Create 2nd stage page table mapping - Level 3 */ > + BUG_ON(pte_none(pte)); > set_pte_ext(pte, *new_pte, 0); > + get_page(virt_to_page(pte)); > > return 0; > } > @@ -597,6 +664,7 @@ static bool hva_to_gpa(struct kvm *kvm, unsigned long hva, gpa_t *gpa) > /* no overlapping memslots allowed: break */ > break; > } > + stray whitespace? 
> } > > mutex_unlock(&kvm->slots_lock); > @@ -613,10 +681,8 @@ int kvm_unmap_hva(struct kvm *kvm, unsigned long hva) > > spin_lock(&kvm->arch.pgd_lock); > found = hva_to_gpa(kvm, hva, &gpa); > - if (found) { > - stage2_set_pte(kvm, gpa, &null_pte); > - __kvm_tlb_flush_vmid(kvm); > - } > + if (found) > + stage2_clear_pte(kvm, gpa); > spin_unlock(&kvm->arch.pgd_lock); > return 0; > } > -- > 1.7.10.3 > > _______________________________________________ kvmarm mailing list kvmarm@xxxxxxxxxxxxxxxxxxxxx https://lists.cs.columbia.edu/cucslists/listinfo/kvmarm