Paul Mackerras <paulus@xxxxxxxxxx> writes: > This fixes several bugs in the radix page fault handler relating to > the way large pages in the memory backing the guest were handled. > First, the check for large pages only checked for explicit huge pages > and missed transparent huge pages. Then the check that the addresses > (host virtual vs. guest physical) had appropriate alignment was > wrong, meaning that the code never put a large page in the partition > scoped radix tree; it was always demoted to a small page. > > Fixing this exposed bugs in kvmppc_create_pte(). We were never > invalidating a 2MB PTE, which meant that if a page was initially > faulted in without write permission and the guest then attempted > to store to it, we would never update the PTE to have write permission. > If we find a valid 2MB PTE in the PMD, we need to clear it and > do a TLB invalidation before installing either the new 2MB PTE or > a pointer to a page table page. > > This also corrects an assumption that get_user_pages_fast would set > the _PAGE_DIRTY bit if we are writing, which is not true. Instead we > mark the page dirty explicitly with set_page_dirty_lock(). This > also means we don't need the dirty bit set on the host PTE when > providing write access on a read fault. > > Signed-off-by: Paul Mackerras <paulus@xxxxxxxxxx> > --- > arch/powerpc/kvm/book3s_64_mmu_radix.c | 70 +++++++++++++++++++++------------- > 1 file changed, 44 insertions(+), 26 deletions(-) > > diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c > index 0c85481..5798c2c 100644 > --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c > +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c > @@ -195,6 +195,12 @@ static void kvmppc_pte_free(pte_t *ptep) > kmem_cache_free(kvm_pte_cache, ptep); > } > > +/* Like pmd_huge() and pmd_large(), but works regardless of config options */ > +static inline int pmd_is_leaf(pmd_t pmd) > +{ > + return !!(pmd_val(pmd) & _PAGE_PTE); > +} > + > static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa, > unsigned int level, unsigned long mmu_seq) > { > @@ -219,7 +225,7 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa, > else > new_pmd = pmd_alloc_one(kvm->mm, gpa); > > - if (level == 0 && !(pmd && pmd_present(*pmd))) > + if (level == 0 && !(pmd && pmd_present(*pmd) && !pmd_is_leaf(*pmd))) > new_ptep = kvmppc_pte_alloc(); > > /* Check if we might have been invalidated; let the guest retry if so */ > @@ -244,12 +250,31 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa, > new_pmd = NULL; > } > pmd = pmd_offset(pud, gpa); > - if (pmd_large(*pmd)) { > - /* Someone else has instantiated a large page here; retry */ > - ret = -EAGAIN; > - goto out_unlock; > - } > - if (level == 1 && !pmd_none(*pmd)) { > + if (pmd_is_leaf(*pmd)) { > + unsigned long lgpa = gpa & PMD_MASK; > + > + /* > + * If we raced with another CPU which has just put > + * a 2MB pte in after we saw a pte page, try again. > + */ > + if (level == 0 && !new_ptep) { > + ret = -EAGAIN; > + goto out_unlock; > + } > + /* Valid 2MB page here already, remove it */ > + old = kvmppc_radix_update_pte(kvm, pmdp_ptep(pmd), > + _PAGE_PRESENT, 0, lgpa, PMD_SHIFT); Can we do s/_PAGE_PRESENT/~0UL/ and that way avoid pmd_clear(pmd) below? > + kvmppc_radix_tlbie_page(kvm, lgpa, PMD_SHIFT); > + if (old & _PAGE_DIRTY) { > + unsigned long gfn = lgpa >> PAGE_SHIFT; > + struct kvm_memory_slot *memslot; > + memslot = gfn_to_memslot(kvm, gfn); > + if (memslot && memslot->dirty_bitmap) > + kvmppc_update_dirty_map(memslot, > + gfn, PMD_SIZE); > + } > + pmd_clear(pmd); > + } else if (level == 1 && !pmd_none(*pmd)) { > /* > * There's a page table page here, but we wanted > * to install a large page. Tell the caller and let > @@ -412,28 +437,24 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, > } else { > page = pages[0]; > pfn = page_to_pfn(page); > - if (PageHuge(page)) { > - page = compound_head(page); > - pte_size <<= compound_order(page); > + if (PageCompound(page)) { > + pte_size <<= compound_order(compound_head(page)); > /* See if we can insert a 2MB large-page PTE here */ > if (pte_size >= PMD_SIZE && > - (gpa & PMD_MASK & PAGE_MASK) == > - (hva & PMD_MASK & PAGE_MASK)) { > + (gpa & (PMD_SIZE - PAGE_SIZE)) == > + (hva & (PMD_SIZE - PAGE_SIZE))) { > level = 1; > pfn &= ~((PMD_SIZE >> PAGE_SHIFT) - 1); > } > } > /* See if we can provide write access */ > if (writing) { > - /* > - * We assume gup_fast has set dirty on the host PTE. > - */ > pgflags |= _PAGE_WRITE; > } else { > local_irq_save(flags); > ptep = find_current_mm_pte(current->mm->pgd, > hva, NULL, NULL); > - if (ptep && pte_write(*ptep) && pte_dirty(*ptep)) > + if (ptep && pte_write(*ptep)) > pgflags |= _PAGE_WRITE; > local_irq_restore(flags); > } > @@ -459,18 +480,15 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, > pte = pfn_pte(pfn, __pgprot(pgflags)); > ret = kvmppc_create_pte(kvm, pte, gpa, level, mmu_seq); > } > - if (ret == 0 || ret == -EAGAIN) > - ret = RESUME_GUEST; > > if (page) { > - /* > - * We drop pages[0] here, not page because page might > - * have been set to the head page of a compound, but > - * we have to drop the reference on the correct tail > - * page to match the get inside gup() > - */ > - put_page(pages[0]); > + if (!ret && (pgflags & _PAGE_WRITE)) > + set_page_dirty_lock(page); > + put_page(page); > } > + > + if (ret == 0 || ret == -EAGAIN) > + ret = RESUME_GUEST; > return ret; > } > > @@ -644,7 +662,7 @@ void kvmppc_free_radix(struct kvm *kvm) > continue; > pmd = pmd_offset(pud, 0); > for (im = 0; im < PTRS_PER_PMD; ++im, ++pmd) { > - if (pmd_huge(*pmd)) { > + if (pmd_is_leaf(*pmd)) { > pmd_clear(pmd); > continue; > } > -- > 2.7.4 > > -- > To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in > the body of a message to majordomo@xxxxxxxxxxxxxxx > More majordomo info at http://vger.kernel.org/majordomo-info.html