Re: [PATCH 5/5 v2] KVM: PPC: Book3S HV: Use __gfn_to_pfn_memslot() in page fault handler

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On 28/3/18 8:31 am, Paul Mackerras wrote:
> This changes the hypervisor page fault handler for radix guests to use
> the generic KVM __gfn_to_pfn_memslot() function instead of using
> get_user_pages_fast() and then handling the case of VM_PFNMAP vmas
> specially.  The old code missed the case of VM_IO vmas; with this
> change, VM_IO vmas will now be handled correctly by code within
> __gfn_to_pfn_memslot.
> 
> Currently, __gfn_to_pfn_memslot calls hva_to_pfn, which only uses
> __get_user_pages_fast for the initial lookup in the cases where
> either atomic or async is set.  Since we are not setting either
> atomic or async, we do our own __get_user_pages_fast first, for now.
> 
> This also adds code to check for the KVM_MEM_READONLY flag on the
> memslot.  If it is set and this is a write access, we synthesize a
> data storage interrupt for the guest.
> 
> In the case where the page is not normal RAM (i.e. page == NULL in
> kvmppc_book3s_radix_page_fault(), we read the PTE from the Linux page
> tables because we need the mapping attribute bits as well as the PFN.
> (The mapping attribute bits indicate whether accesses have to be
> non-cacheable and/or guarded.)
> 
> Signed-off-by: Paul Mackerras <paulus@xxxxxxxxxx>


Tested-by: Alexey Kardashevskiy <aik@xxxxxxxxx>

I tried this particular version on top of other 4 patches from this set
with P9+XHCI passed through, it works.



> ---
> v2: Read the Linux PTE for the non-RAM case to get the attribute bits.
> 
>  arch/powerpc/kvm/book3s_64_mmu_radix.c | 148 ++++++++++++++++++++-------------
>  1 file changed, 88 insertions(+), 60 deletions(-)
> 
> diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c
> index 05acc67..0590f16 100644
> --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c
> +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
> @@ -392,11 +392,11 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
>  	unsigned long mmu_seq, pte_size;
>  	unsigned long gpa, gfn, hva, pfn;
>  	struct kvm_memory_slot *memslot;
> -	struct page *page = NULL, *pages[1];
> -	long ret, npages;
> -	unsigned int writing;
> -	struct vm_area_struct *vma;
> -	unsigned long flags;
> +	struct page *page = NULL;
> +	long ret;
> +	bool writing;
> +	bool upgrade_write = false;
> +	bool *upgrade_p = &upgrade_write;
>  	pte_t pte, *ptep;
>  	unsigned long pgflags;
>  	unsigned int shift, level;
> @@ -436,12 +436,17 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
>  					      dsisr & DSISR_ISSTORE);
>  	}
>  
> -	/* used to check for invalidations in progress */
> -	mmu_seq = kvm->mmu_notifier_seq;
> -	smp_rmb();
> -
>  	writing = (dsisr & DSISR_ISSTORE) != 0;
> -	hva = gfn_to_hva_memslot(memslot, gfn);
> +	if (memslot->flags & KVM_MEM_READONLY) {
> +		if (writing) {
> +			/* give the guest a DSI */
> +			dsisr = DSISR_ISSTORE | DSISR_PROTFAULT;
> +			kvmppc_core_queue_data_storage(vcpu, ea, dsisr);
> +			return RESUME_GUEST;
> +		}
> +		upgrade_p = NULL;
> +	}
> +
>  	if (dsisr & DSISR_SET_RC) {
>  		/*
>  		 * Need to set an R or C bit in the 2nd-level tables;
> @@ -470,69 +475,92 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
>  			return RESUME_GUEST;
>  	}
>  
> -	ret = -EFAULT;
> -	pfn = 0;
> -	pte_size = PAGE_SIZE;
> -	pgflags = _PAGE_READ | _PAGE_EXEC;
> -	level = 0;
> -	npages = get_user_pages_fast(hva, 1, writing, pages);
> -	if (npages < 1) {
> -		/* Check if it's an I/O mapping */
> -		down_read(&current->mm->mmap_sem);
> -		vma = find_vma(current->mm, hva);
> -		if (vma && vma->vm_start <= hva && hva < vma->vm_end &&
> -		    (vma->vm_flags & VM_PFNMAP)) {
> -			pfn = vma->vm_pgoff +
> -				((hva - vma->vm_start) >> PAGE_SHIFT);
> -			pgflags = pgprot_val(vma->vm_page_prot);
> -		}
> -		up_read(&current->mm->mmap_sem);
> -		if (!pfn)
> -			return -EFAULT;
> -	} else {
> -		page = pages[0];
> +	/* used to check for invalidations in progress */
> +	mmu_seq = kvm->mmu_notifier_seq;
> +	smp_rmb();
> +
> +	/*
> +	 * Do a fast check first, since __gfn_to_pfn_memslot doesn't
> +	 * do it with !atomic && !async, which is how we call it.
> +	 * We always ask for write permission since the common case
> +	 * is that the page is writable.
> +	 */
> +	hva = gfn_to_hva_memslot(memslot, gfn);
> +	if (upgrade_p && __get_user_pages_fast(hva, 1, 1, &page) == 1) {
>  		pfn = page_to_pfn(page);
> -		if (PageCompound(page)) {
> -			pte_size <<= compound_order(compound_head(page));
> -			/* See if we can insert a 1GB or 2MB large PTE here */
> -			if (pte_size >= PUD_SIZE &&
> -			    (gpa & (PUD_SIZE - PAGE_SIZE)) ==
> -			    (hva & (PUD_SIZE - PAGE_SIZE))) {
> -				level = 2;
> -				pfn &= ~((PUD_SIZE >> PAGE_SHIFT) - 1);
> -			} else if (pte_size >= PMD_SIZE &&
> -			    (gpa & (PMD_SIZE - PAGE_SIZE)) ==
> -			    (hva & (PMD_SIZE - PAGE_SIZE))) {
> -				level = 1;
> -				pfn &= ~((PMD_SIZE >> PAGE_SHIFT) - 1);
> -			}
> +		upgrade_write = true;
> +	} else {
> +		/* Call KVM generic code to do the slow-path check */
> +		pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL,
> +					   writing, upgrade_p);
> +		if (is_error_noslot_pfn(pfn))
> +			return -EFAULT;
> +		page = NULL;
> +		if (pfn_valid(pfn)) {
> +			page = pfn_to_page(pfn);
> +			if (PageReserved(page))
> +				page = NULL;
>  		}
> -		/* See if we can provide write access */
> -		if (writing) {
> -			pgflags |= _PAGE_WRITE;
> -		} else {
> -			local_irq_save(flags);
> -			ptep = find_current_mm_pte(current->mm->pgd,
> -						   hva, NULL, NULL);
> -			if (ptep && pte_write(*ptep))
> -				pgflags |= _PAGE_WRITE;
> -			local_irq_restore(flags);
> +	}
> +
> +	/* See if we can insert a 1GB or 2MB large PTE here */
> +	level = 0;
> +	if (page && PageCompound(page)) {
> +		pte_size = PAGE_SIZE << compound_order(compound_head(page));
> +		if (pte_size >= PUD_SIZE &&
> +		    (gpa & (PUD_SIZE - PAGE_SIZE)) ==
> +		    (hva & (PUD_SIZE - PAGE_SIZE))) {
> +			level = 2;
> +			pfn &= ~((PUD_SIZE >> PAGE_SHIFT) - 1);
> +		} else if (pte_size >= PMD_SIZE &&
> +			   (gpa & (PMD_SIZE - PAGE_SIZE)) ==
> +			   (hva & (PMD_SIZE - PAGE_SIZE))) {
> +			level = 1;
> +			pfn &= ~((PMD_SIZE >> PAGE_SHIFT) - 1);
>  		}
>  	}
>  
>  	/*
>  	 * Compute the PTE value that we need to insert.
>  	 */
> -	pgflags |= _PAGE_PRESENT | _PAGE_PTE | _PAGE_ACCESSED;
> -	if (pgflags & _PAGE_WRITE)
> -		pgflags |= _PAGE_DIRTY;
> -	pte = pfn_pte(pfn, __pgprot(pgflags));
> +	if (page) {
> +		pgflags = _PAGE_READ | _PAGE_EXEC | _PAGE_PRESENT | _PAGE_PTE |
> +			_PAGE_ACCESSED;
> +		if (writing || upgrade_write)
> +			pgflags |= _PAGE_WRITE | _PAGE_DIRTY;
> +		pte = pfn_pte(pfn, __pgprot(pgflags));
> +	} else {
> +		/*
> +		 * Read the PTE from the process' radix tree and use that
> +		 * so we get the attribute bits.
> +		 */
> +		local_irq_disable();
> +		ptep = __find_linux_pte(vcpu->arch.pgdir, hva, NULL, &shift);
> +		pte = *ptep;
> +		local_irq_enable();
> +		if (shift == PUD_SHIFT &&
> +		    (gpa & (PUD_SIZE - PAGE_SIZE)) ==
> +		    (hva & (PUD_SIZE - PAGE_SIZE))) {
> +			level = 2;
> +		} else if (shift == PMD_SHIFT &&
> +			   (gpa & (PMD_SIZE - PAGE_SIZE)) ==
> +			   (hva & (PMD_SIZE - PAGE_SIZE))) {
> +			level = 1;
> +		} else if (shift && shift != PAGE_SHIFT) {
> +			/* Adjust PFN */
> +			unsigned long mask = (1ul << shift) - PAGE_SIZE;
> +			pte = __pte(pte_val(pte) | (hva & mask));
> +		}
> +		if (!(writing || upgrade_write))
> +			pte = __pte(pte_val(pte) & ~ _PAGE_WRITE);
> +		pte = __pte(pte_val(pte) | _PAGE_EXEC);
> +	}
>  
>  	/* Allocate space in the tree and write the PTE */
>  	ret = kvmppc_create_pte(kvm, pte, gpa, level, mmu_seq);
>  
>  	if (page) {
> -		if (!ret && (pgflags & _PAGE_WRITE))
> +		if (!ret && (pte_val(pte) & _PAGE_WRITE))
>  			set_page_dirty_lock(page);
>  		put_page(page);
>  	}
> 


-- 
Alexey



[Index of Archives]     [KVM ARM]     [KVM ia64]     [KVM ppc]     [Virtualization Tools]     [Spice Development]     [Libvirt]     [Libvirt Users]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite Questions]     [Linux Kernel]     [Linux SCSI]     [XFree86]

  Powered by Linux