Re: [PATCH] KVM: PPC: Book3S HV: Fix handling of large pages in radix page fault handler

"Aneesh Kumar K.V" <aneesh.kumar@xxxxxxxxxxxxxxxxxx> · Mon, 26 Feb 2018 12:34:25 +0530

Paul Mackerras <paulus@xxxxxxxxxx> writes:

> This fixes several bugs in the radix page fault handler relating to
> the way large pages in the memory backing the guest were handled.
> First, the check for large pages only checked for explicit huge pages
> and missed transparent huge pages.  Then the check that the addresses
> (host virtual vs. guest physical) had appropriate alignment was
> wrong, meaning that the code never put a large page in the partition
> scoped radix tree; it was always demoted to a small page.
>
> Fixing this exposed bugs in kvmppc_create_pte().  We were never
> invalidating a 2MB PTE, which meant that if a page was initially
> faulted in without write permission and the guest then attempted
> to store to it, we would never update the PTE to have write permission.
> If we find a valid 2MB PTE in the PMD, we need to clear it and
> do a TLB invalidation before installing either the new 2MB PTE or
> a pointer to a page table page.
>
> This also corrects an assumption that get_user_pages_fast would set
> the _PAGE_DIRTY bit if we are writing, which is not true.  Instead we
> mark the page dirty explicitly with set_page_dirty_lock().  This
> also means we don't need the dirty bit set on the host PTE when
> providing write access on a read fault.
>
> Signed-off-by: Paul Mackerras <paulus@xxxxxxxxxx>
> ---
>  arch/powerpc/kvm/book3s_64_mmu_radix.c | 70 +++++++++++++++++++++-------------
>  1 file changed, 44 insertions(+), 26 deletions(-)
>
> diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c
> index 0c85481..5798c2c 100644
> --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c
> +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
> @@ -195,6 +195,12 @@ static void kvmppc_pte_free(pte_t *ptep)
>  	kmem_cache_free(kvm_pte_cache, ptep);
>  }
>  
> +/* Like pmd_huge() and pmd_large(), but works regardless of config options */
> +static inline int pmd_is_leaf(pmd_t pmd)
> +{
> +	return !!(pmd_val(pmd) & _PAGE_PTE);
> +}
> +
>  static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa,
>  			     unsigned int level, unsigned long mmu_seq)
>  {
> @@ -219,7 +225,7 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa,
>  	else
>  		new_pmd = pmd_alloc_one(kvm->mm, gpa);
>  
> -	if (level == 0 && !(pmd && pmd_present(*pmd)))
> +	if (level == 0 && !(pmd && pmd_present(*pmd) && !pmd_is_leaf(*pmd)))
>  		new_ptep = kvmppc_pte_alloc();
>  
>  	/* Check if we might have been invalidated; let the guest retry if so */
> @@ -244,12 +250,31 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa,
>  		new_pmd = NULL;
>  	}
>  	pmd = pmd_offset(pud, gpa);
> -	if (pmd_large(*pmd)) {
> -		/* Someone else has instantiated a large page here; retry */
> -		ret = -EAGAIN;
> -		goto out_unlock;
> -	}
> -	if (level == 1 && !pmd_none(*pmd)) {
> +	if (pmd_is_leaf(*pmd)) {
> +		unsigned long lgpa = gpa & PMD_MASK;
> +
> +		/*
> +		 * If we raced with another CPU which has just put
> +		 * a 2MB pte in after we saw a pte page, try again.
> +		 */
> +		if (level == 0 && !new_ptep) {
> +			ret = -EAGAIN;
> +			goto out_unlock;
> +		}
> +		/* Valid 2MB page here already, remove it */
> +		old = kvmppc_radix_update_pte(kvm, pmdp_ptep(pmd),
> +					_PAGE_PRESENT, 0, lgpa, PMD_SHIFT);

Can we do s/_PAGE_PRESENT/~0UL/  and that way avoid pmd_clear(pmd) below?

> +		kvmppc_radix_tlbie_page(kvm, lgpa, PMD_SHIFT);
> +		if (old & _PAGE_DIRTY) {
> +			unsigned long gfn = lgpa >> PAGE_SHIFT;
> +			struct kvm_memory_slot *memslot;
> +			memslot = gfn_to_memslot(kvm, gfn);
> +			if (memslot && memslot->dirty_bitmap)
> +				kvmppc_update_dirty_map(memslot,
> +							gfn, PMD_SIZE);
> +		}
> +		pmd_clear(pmd);
> +	} else if (level == 1 && !pmd_none(*pmd)) {
>  		/*
>  		 * There's a page table page here, but we wanted
>  		 * to install a large page.  Tell the caller and let
> @@ -412,28 +437,24 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
>  	} else {
>  		page = pages[0];
>  		pfn = page_to_pfn(page);
> -		if (PageHuge(page)) {
> -			page = compound_head(page);
> -			pte_size <<= compound_order(page);
> +		if (PageCompound(page)) {
> +			pte_size <<= compound_order(compound_head(page));
>  			/* See if we can insert a 2MB large-page PTE here */
>  			if (pte_size >= PMD_SIZE &&
> -			    (gpa & PMD_MASK & PAGE_MASK) ==
> -			    (hva & PMD_MASK & PAGE_MASK)) {
> +			    (gpa & (PMD_SIZE - PAGE_SIZE)) ==
> +			    (hva & (PMD_SIZE - PAGE_SIZE))) {
>  				level = 1;
>  				pfn &= ~((PMD_SIZE >> PAGE_SHIFT) - 1);
>  			}
>  		}
>  		/* See if we can provide write access */
>  		if (writing) {
> -			/*
> -			 * We assume gup_fast has set dirty on the host PTE.
> -			 */
>  			pgflags |= _PAGE_WRITE;
>  		} else {
>  			local_irq_save(flags);
>  			ptep = find_current_mm_pte(current->mm->pgd,
>  						   hva, NULL, NULL);
> -			if (ptep && pte_write(*ptep) && pte_dirty(*ptep))
> +			if (ptep && pte_write(*ptep))
>  				pgflags |= _PAGE_WRITE;
>  			local_irq_restore(flags);
>  		}
> @@ -459,18 +480,15 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
>  		pte = pfn_pte(pfn, __pgprot(pgflags));
>  		ret = kvmppc_create_pte(kvm, pte, gpa, level, mmu_seq);
>  	}
> -	if (ret == 0 || ret == -EAGAIN)
> -		ret = RESUME_GUEST;
>  
>  	if (page) {
> -		/*
> -		 * We drop pages[0] here, not page because page might
> -		 * have been set to the head page of a compound, but
> -		 * we have to drop the reference on the correct tail
> -		 * page to match the get inside gup()
> -		 */
> -		put_page(pages[0]);
> +		if (!ret && (pgflags & _PAGE_WRITE))
> +			set_page_dirty_lock(page);
> +		put_page(page);
>  	}
> +
> +	if (ret == 0 || ret == -EAGAIN)
> +		ret = RESUME_GUEST;
>  	return ret;
>  }
>  
> @@ -644,7 +662,7 @@ void kvmppc_free_radix(struct kvm *kvm)
>  				continue;
>  			pmd = pmd_offset(pud, 0);
>  			for (im = 0; im < PTRS_PER_PMD; ++im, ++pmd) {
> -				if (pmd_huge(*pmd)) {
> +				if (pmd_is_leaf(*pmd)) {
>  					pmd_clear(pmd);
>  					continue;
>  				}
> -- 
> 2.7.4
>
> --
> To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
> the body of a message to majordomo@xxxxxxxxxxxxxxx
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

--
To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html