[Android-virt] [PATCH 1/6] ARM: KVM: add page accounting and guest page table eviction

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Mon, May 14, 2012 at 4:05 PM, Marc Zyngier <marc.zyngier at arm.com> wrote:
> Until now, we do not track how many references the page tables
> contain, which makes them impossible to free from the MMU notifiers.
>
> By updating the page table pages' refcounts each time we fault in
> or evict a page, it becomes easy to track the state of the various
> page tables, and to free them when them become empty.

s/them/they/

>
> Only the top PGD is never freed.

s/top/level-1/ ?

>
> Signed-off-by: Marc Zyngier <marc.zyngier at arm.com>
> ---
> ?arch/arm/kvm/mmu.c | ? 72 ++++++++++++++++++++++++++++++++++++++++++++++------
> ?1 files changed, 64 insertions(+), 8 deletions(-)
>
> diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
> index 889b2ee..5353ad4 100644
> --- a/arch/arm/kvm/mmu.c
> +++ b/arch/arm/kvm/mmu.c
> @@ -199,13 +199,16 @@ int kvm_alloc_stage2_pgd(struct kvm *kvm)
> ?static void free_guest_pages(pte_t *pte, unsigned long addr)
> ?{
> ? ? ? ?unsigned int i;
> - ? ? ? struct page *page;
> + ? ? ? struct page *page, *pte_page;
> +
> + ? ? ? pte_page = virt_to_page(pte);
>
> ? ? ? ?for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
> ? ? ? ? ? ? ? ?if (!pte_present(*pte))
> ? ? ? ? ? ? ? ? ? ? ? ?goto next_page;
> ? ? ? ? ? ? ? ?page = pfn_to_page(pte_pfn(*pte));
> ? ? ? ? ? ? ? ?put_page(page);
> + ? ? ? ? ? ? ? put_page(pte_page);
> ?next_page:
> ? ? ? ? ? ? ? ?pte++;
> ? ? ? ?}
> @@ -215,7 +218,9 @@ static void free_stage2_ptes(pmd_t *pmd, unsigned long addr)
> ?{
> ? ? ? ?unsigned int i;
> ? ? ? ?pte_t *pte;
> - ? ? ? struct page *page;
> + ? ? ? struct page *page, *pmd_page;
> +
> + ? ? ? pmd_page = virt_to_page(pmd);
>
> ? ? ? ?for (i = 0; i < PTRS_PER_PMD; i++, addr += PMD_SIZE) {
> ? ? ? ? ? ? ? ?BUG_ON(pmd_sect(*pmd));
> @@ -223,8 +228,10 @@ static void free_stage2_ptes(pmd_t *pmd, unsigned long addr)
> ? ? ? ? ? ? ? ? ? ? ? ?pte = pte_offset_kernel(pmd, addr);
> ? ? ? ? ? ? ? ? ? ? ? ?free_guest_pages(pte, addr);
> ? ? ? ? ? ? ? ? ? ? ? ?page = virt_to_page((void *)pte);
> - ? ? ? ? ? ? ? ? ? ? ? WARN_ON(atomic_read(&page->_count) != 1);
> + ? ? ? ? ? ? ? ? ? ? ? WARN_ON(page_count(page) != 1);
> ? ? ? ? ? ? ? ? ? ? ? ?pte_free_kernel(NULL, pte);
> +
> + ? ? ? ? ? ? ? ? ? ? ? put_page(pmd_page);
> ? ? ? ? ? ? ? ?}
> ? ? ? ? ? ? ? ?pmd++;
> ? ? ? ?}
> @@ -247,6 +254,7 @@ void kvm_free_stage2_pgd(struct kvm *kvm)
> ? ? ? ?pud_t *pud;
> ? ? ? ?pmd_t *pmd;
> ? ? ? ?unsigned long long i, addr;
> + ? ? ? struct page *page, *pud_page;
>
> ? ? ? ?if (kvm->arch.pgd == NULL)
> ? ? ? ? ? ? ? ?return;
> @@ -260,6 +268,7 @@ void kvm_free_stage2_pgd(struct kvm *kvm)
> ? ? ? ? ? ? ? ?addr = i * (unsigned long long)PGDIR_SIZE;
> ? ? ? ? ? ? ? ?pgd = kvm->arch.pgd + i;
> ? ? ? ? ? ? ? ?pud = pud_offset(pgd, addr);
> + ? ? ? ? ? ? ? pud_page = virt_to_page(pud);
>
> ? ? ? ? ? ? ? ?if (pud_none(*pud))
> ? ? ? ? ? ? ? ? ? ? ? ?continue;
> @@ -268,14 +277,60 @@ void kvm_free_stage2_pgd(struct kvm *kvm)
>
> ? ? ? ? ? ? ? ?pmd = pmd_offset(pud, addr);
> ? ? ? ? ? ? ? ?free_stage2_ptes(pmd, addr);
> + ? ? ? ? ? ? ? page = virt_to_page((void *)pmd);
> + ? ? ? ? ? ? ? WARN_ON(page_count(page) != 1);
> ? ? ? ? ? ? ? ?pmd_free(NULL, pmd);
> + ? ? ? ? ? ? ? put_page(pud_page);
> ? ? ? ?}
>
> + ? ? ? WARN_ON(page_count(pud_page) != 1);
> ? ? ? ?free_pages((unsigned long)kvm->arch.pgd, PGD2_ORDER);
> ? ? ? ?kvm->arch.pgd = NULL;
> ?}
>
> -static const pte_t null_pte;
> +static int stage2_clear_pte(struct kvm *kvm, phys_addr_t addr)
> +{
> + ? ? ? pgd_t *pgd;
> + ? ? ? pud_t *pud;
> + ? ? ? pmd_t *pmd;
> + ? ? ? pte_t *pte;
> + ? ? ? struct page *page;
> +
> + ? ? ? pgd = kvm->arch.pgd + pgd_index(addr);
> + ? ? ? pud = pud_offset(pgd, addr);
> + ? ? ? BUG_ON(pud_none(*pud));
> +
> + ? ? ? pmd = pmd_offset(pud, addr);
> + ? ? ? BUG_ON(pmd_none(*pmd));
> +
> + ? ? ? pte = pte_offset_kernel(pmd, addr);
> + ? ? ? set_pte_ext(pte, __pte(0), 0);
> +
> + ? ? ? page = virt_to_page(pte);
> + ? ? ? put_page(page);
> + ? ? ? if (page_count(page) != 1)
> + ? ? ? ? ? ? ? return 1; ? ? ? /* TLB invalidate required */
> +

what's the distinction between returning 1 or 0 here? why not simply
return 1 in all subsequent cases and let the caller free this?

If this is for synchronization reasons and I fail to see it, then
perhaps there should be a comment explaining it?

In fact, what happens if another VCPU takes a page fault while this is
running and page_count(page) hits 1, but before pmd_clear a new page
has been allocated there. Don't we leak memory in this case?

> + ? ? ? /* Need to remove pte page */
> + ? ? ? pmd_clear(pmd);
> + ? ? ? __kvm_tlb_flush_vmid(kvm);
> + ? ? ? pte_free_kernel(NULL, (pte_t *)((unsigned long)pte & PAGE_MASK));
> +
> + ? ? ? page = virt_to_page(pmd);
> + ? ? ? put_page(page);
> + ? ? ? if (page_count(page) != 1)
> + ? ? ? ? ? ? ? return 0; ? ? ? /* TLB already invalidated */
> +
> + ? ? ? /* Need to remove pmd page */
> + ? ? ? pud_clear(pud);
> + ? ? ? __kvm_tlb_flush_vmid(kvm);
> + ? ? ? pmd_free(NULL, (pmd_t *)((unsigned long)pmd & PAGE_MASK));
> +
> + ? ? ? page = virt_to_page(pud);
> + ? ? ? put_page(page);
> +
> + ? ? ? return 0; ? ? ? ? ? ? ? /* TLB already invalidated */
> +}
>
> ?static int stage2_set_pte(struct kvm *kvm, phys_addr_t addr, const pte_t *new_pte)
> ?{
> @@ -288,7 +343,6 @@ static int stage2_set_pte(struct kvm *kvm, phys_addr_t addr, const pte_t *new_pt
> ? ? ? ?pgd = kvm->arch.pgd + pgd_index(addr);
> ? ? ? ?pud = pud_offset(pgd, addr);
> ? ? ? ?if (pud_none(*pud)) {
> - ? ? ? ? ? ? ? BUG_ON(new_pte == &null_pte);
> ? ? ? ? ? ? ? ?pmd = pmd_alloc_one(NULL, addr);
> ? ? ? ? ? ? ? ?if (!pmd) {
> ? ? ? ? ? ? ? ? ? ? ? ?kvm_err("Cannot allocate 2nd stage pmd\n");
> @@ -296,12 +350,12 @@ static int stage2_set_pte(struct kvm *kvm, phys_addr_t addr, const pte_t *new_pt
> ? ? ? ? ? ? ? ?}
> ? ? ? ? ? ? ? ?pud_populate(NULL, pud, pmd);
> ? ? ? ? ? ? ? ?pmd += pmd_index(addr);
> + ? ? ? ? ? ? ? get_page(virt_to_page(pud));
> ? ? ? ?} else
> ? ? ? ? ? ? ? ?pmd = pmd_offset(pud, addr);
>
> ? ? ? ?/* Create 2nd stage page table mapping - Level 2 */
> ? ? ? ?if (pmd_none(*pmd)) {
> - ? ? ? ? ? ? ? BUG_ON(new_pte == &null_pte);
> ? ? ? ? ? ? ? ?pte = pte_alloc_one_kernel(NULL, addr);
> ? ? ? ? ? ? ? ?if (!pte) {
> ? ? ? ? ? ? ? ? ? ? ? ?kvm_err("Cannot allocate 2nd stage pte\n");
> @@ -309,11 +363,13 @@ static int stage2_set_pte(struct kvm *kvm, phys_addr_t addr, const pte_t *new_pt
> ? ? ? ? ? ? ? ?}
> ? ? ? ? ? ? ? ?pmd_populate_kernel(NULL, pmd, pte);
> ? ? ? ? ? ? ? ?pte += pte_index(addr);
> + ? ? ? ? ? ? ? get_page(virt_to_page(pmd));
> ? ? ? ?} else
> ? ? ? ? ? ? ? ?pte = pte_offset_kernel(pmd, addr);
>
> ? ? ? ?/* Create 2nd stage page table mapping - Level 3 */
> ? ? ? ?set_pte_ext(pte, *new_pte, 0);
> + ? ? ? get_page(virt_to_page(pte));

don't we need to check if !pte_none(pte) before setting it, and not
call get_page in that case?

seems we should have a BUG_ON or something otherwise.

>
> ? ? ? ?return 0;
> ?}
> @@ -567,9 +623,9 @@ int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
> ? ? ? ? ? ? ? ? ? ? ? ?gpa_t gpa_offset = hva - start;
> ? ? ? ? ? ? ? ? ? ? ? ?gpa_t gpa = (memslot->base_gfn << PAGE_SHIFT) + gpa_offset;
>
> - ? ? ? ? ? ? ? ? ? ? ? stage2_set_pte(kvm, gpa, &null_pte);
> - ? ? ? ? ? ? ? ? ? ? ? needs_stage2_flush = 1;
> + ? ? ? ? ? ? ? ? ? ? ? needs_stage2_flush = stage2_clear_pte(kvm, gpa);
> ? ? ? ? ? ? ? ?}
> +
> ? ? ? ?}
>
> ? ? ? ?if (needs_stage2_flush)
> --
> 1.7.7.1



[Index of Archives]     [Linux KVM]     [Spice Development]     [Libvirt]     [Libvirt Users]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux