On Tue, Sep 04, 2018 at 06:16:01PM +1000, Nicholas Piggin wrote:
> THP paths can defer splitting compound pages until after the actual
> remap and TLB flushes to split a huge PMD/PUD. This causes radix
> partition scope page table mappings to get out of sync with the host
> qemu page table mappings.
>
> This results in random memory corruption in the guest when running
> with THP. The easiest way to reproduce it is to use the KVM balloon to
> free up a lot of memory in the guest and then shrink the balloon to
> give the memory back, while some work is being done in the guest.
>
> Cc: Paul Mackerras <paulus@xxxxxxxxxx>
> Cc: David Gibson <david@xxxxxxxxxxxxxxxxxxxxx>
> Cc: "Aneesh Kumar K.V" <aneesh.kumar@xxxxxxxxxxxxx>
> Cc: linuxppc-dev@xxxxxxxxxxxxxxxx
> Signed-off-by: Nicholas Piggin <npiggin@xxxxxxxxx>

Seems to fix the problem on my test case.

Tested-by: David Gibson <david@xxxxxxxxxxxxxxxxxxxxx>

> ---
>  arch/powerpc/kvm/book3s_64_mmu_radix.c | 88 ++++++++++----------------
>  1 file changed, 34 insertions(+), 54 deletions(-)
>
> diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c
> index 0af1c0aea1fe..d8792445d95a 100644
> --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c
> +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
> @@ -525,8 +525,8 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
>                                     unsigned long ea, unsigned long dsisr)
>  {
>          struct kvm *kvm = vcpu->kvm;
> -        unsigned long mmu_seq, pte_size;
> -        unsigned long gpa, gfn, hva, pfn;
> +        unsigned long mmu_seq;
> +        unsigned long gpa, gfn, hva;
>          struct kvm_memory_slot *memslot;
>          struct page *page = NULL;
>          long ret;
> @@ -623,9 +623,10 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
>           */
>          hva = gfn_to_hva_memslot(memslot, gfn);
>          if (upgrade_p && __get_user_pages_fast(hva, 1, 1, &page) == 1) {
> -                pfn = page_to_pfn(page);
>                  upgrade_write = true;
>          } else {
> +                unsigned long pfn;
> +
>                  /* Call KVM generic code to do the slow-path check */
>                  pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL,
>                                             writing, upgrade_p);
> @@ -639,63 +640,42 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
>                  }
>          }
>
> -        /* See if we can insert a 1GB or 2MB large PTE here */
> -        level = 0;
> -        if (page && PageCompound(page)) {
> -                pte_size = PAGE_SIZE << compound_order(compound_head(page));
> -                if (pte_size >= PUD_SIZE &&
> -                    (gpa & (PUD_SIZE - PAGE_SIZE)) ==
> -                    (hva & (PUD_SIZE - PAGE_SIZE))) {
> -                        level = 2;
> -                        pfn &= ~((PUD_SIZE >> PAGE_SHIFT) - 1);
> -                } else if (pte_size >= PMD_SIZE &&
> -                           (gpa & (PMD_SIZE - PAGE_SIZE)) ==
> -                           (hva & (PMD_SIZE - PAGE_SIZE))) {
> -                        level = 1;
> -                        pfn &= ~((PMD_SIZE >> PAGE_SHIFT) - 1);
> -                }
> -        }
> -
>          /*
> -         * Compute the PTE value that we need to insert.
> +         * Read the PTE from the process' radix tree and use that
> +         * so we get the shift and attribute bits.
>           */
> -        if (page) {
> -                pgflags = _PAGE_READ | _PAGE_EXEC | _PAGE_PRESENT | _PAGE_PTE |
> -                        _PAGE_ACCESSED;
> -                if (writing || upgrade_write)
> -                        pgflags |= _PAGE_WRITE | _PAGE_DIRTY;
> -                pte = pfn_pte(pfn, __pgprot(pgflags));
> +        local_irq_disable();
> +        ptep = __find_linux_pte(vcpu->arch.pgdir, hva, NULL, &shift);
> +        pte = *ptep;
> +        local_irq_enable();
> +
> +        /* Get pte level from shift/size */
> +        if (shift == PUD_SHIFT &&
> +            (gpa & (PUD_SIZE - PAGE_SIZE)) ==
> +            (hva & (PUD_SIZE - PAGE_SIZE))) {
> +                level = 2;
> +        } else if (shift == PMD_SHIFT &&
> +                   (gpa & (PMD_SIZE - PAGE_SIZE)) ==
> +                   (hva & (PMD_SIZE - PAGE_SIZE))) {
> +                level = 1;
>          } else {
> -                /*
> -                 * Read the PTE from the process' radix tree and use that
> -                 * so we get the attribute bits.
> -                 */
> -                local_irq_disable();
> -                ptep = __find_linux_pte(vcpu->arch.pgdir, hva, NULL, &shift);
> -                pte = *ptep;
> -                local_irq_enable();
> -                if (shift == PUD_SHIFT &&
> -                    (gpa & (PUD_SIZE - PAGE_SIZE)) ==
> -                    (hva & (PUD_SIZE - PAGE_SIZE))) {
> -                        level = 2;
> -                } else if (shift == PMD_SHIFT &&
> -                           (gpa & (PMD_SIZE - PAGE_SIZE)) ==
> -                           (hva & (PMD_SIZE - PAGE_SIZE))) {
> -                        level = 1;
> -                } else if (shift && shift != PAGE_SHIFT) {
> -                        /* Adjust PFN */
> -                        unsigned long mask = (1ul << shift) - PAGE_SIZE;
> -                        pte = __pte(pte_val(pte) | (hva & mask));
> -                }
> -                pte = __pte(pte_val(pte) | _PAGE_EXEC | _PAGE_ACCESSED);
> -                if (writing || upgrade_write) {
> -                        if (pte_val(pte) & _PAGE_WRITE)
> -                                pte = __pte(pte_val(pte) | _PAGE_DIRTY);
> -                } else {
> -                        pte = __pte(pte_val(pte) & ~(_PAGE_WRITE | _PAGE_DIRTY));
> +                level = 0;
> +
> +                /* Can not cope with unknown page shift */
> +                if (shift && shift != PAGE_SHIFT) {
> +                        WARN_ON_ONCE(1);
> +                        return -EFAULT;
>                  }
>          }
>
> +        pte = __pte(pte_val(pte) | _PAGE_EXEC | _PAGE_ACCESSED);
> +        if (writing || upgrade_write) {
> +                if (pte_val(pte) & _PAGE_WRITE)
> +                        pte = __pte(pte_val(pte) | _PAGE_DIRTY);
> +        } else {
> +                pte = __pte(pte_val(pte) & ~(_PAGE_WRITE | _PAGE_DIRTY));
> +        }
> +
>          /* Allocate space in the tree and write the PTE */
>          ret = kvmppc_create_pte(kvm, pte, gpa, level, mmu_seq);
>

-- 
David Gibson                    | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you. NOT _the_ _other_
                                | _way_ _around_!
http://www.ozlabs.org/~dgibson
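For readers skimming the diff, the heart of the new logic is the shift-plus-alignment
test that picks the partition-scope mapping level from the host PTE. Below is a
minimal, self-contained sketch of just that test. It is not part of the patch: the
mapping_level() helper, the main() driver, and the PAGE/PMD/PUD shift constants are
illustrative assumptions for a 64-bit host with 4kB base pages, 2MB PMDs and 1GB PUDs.
It shows that the guest physical address (gpa) and host virtual address (hva) must be
congruent modulo the large-page size before a 2MB (level 1) or 1GB (level 2) mapping
is chosen, and that an unknown host page size cannot be coped with.

/*
 * Illustrative sketch only -- not from the patch.  Constants assume a
 * 64-bit host with 4kB base pages.
 */
#include <stdio.h>

#define PAGE_SHIFT 12UL
#define PMD_SHIFT  21UL
#define PUD_SHIFT  30UL
#define PAGE_SIZE  (1UL << PAGE_SHIFT)
#define PMD_SIZE   (1UL << PMD_SHIFT)
#define PUD_SIZE   (1UL << PUD_SHIFT)

/* Returns 2 for a 1GB mapping, 1 for 2MB, 0 for 4kB, -1 for "cannot cope". */
static int mapping_level(unsigned long gpa, unsigned long hva,
                         unsigned long shift)
{
        if (shift == PUD_SHIFT &&
            (gpa & (PUD_SIZE - PAGE_SIZE)) == (hva & (PUD_SIZE - PAGE_SIZE)))
                return 2;
        if (shift == PMD_SHIFT &&
            (gpa & (PMD_SIZE - PAGE_SIZE)) == (hva & (PMD_SIZE - PAGE_SIZE)))
                return 1;
        if (shift && shift != PAGE_SHIFT)
                return -1;      /* mirrors the WARN_ON_ONCE()/-EFAULT path */
        return 0;
}

int main(void)
{
        /* gpa and hva line up within a 2MB region: prints 1 (level 1). */
        printf("%d\n", mapping_level(0x40200000UL, 0x7f0040200000UL, PMD_SHIFT));
        /* gpa offset by 4kB within the 2MB region: prints -1 (unsupported). */
        printf("%d\n", mapping_level(0x40201000UL, 0x7f0040200000UL, PMD_SHIFT));
        return 0;
}

With these illustrative values the first call reports level 1, while the second
falls into the unsupported case, which in the patch corresponds to the
WARN_ON_ONCE()/-EFAULT path rather than the old code's "Adjust PFN" fallback.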