> -----Original Message-----
> From: kvm-owner@xxxxxxxxxxxxxxx [mailto:kvm-owner@xxxxxxxxxxxxxxx] On Behalf Of Paul Mackerras
> Sent: Tuesday, August 06, 2013 9:58 AM
> To: Alexander Graf; Benjamin Herrenschmidt
> Cc: kvm-ppc@xxxxxxxxxxxxxxx; kvm@xxxxxxxxxxxxxxx
> Subject: [PATCH 21/23] KVM: PPC: Book3S PR: Use mmu_notifier_retry() in kvmppc_mmu_map_page()
>
> When the MM code is invalidating a range of pages, it calls the KVM
> kvm_mmu_notifier_invalidate_range_start() notifier function, which calls
> kvm_unmap_hva_range(), which arranges to flush all the existing host
> HPTEs for guest pages.  However, the Linux PTEs for the range being
> flushed are still valid at that point.  We are not supposed to establish
> any new references to pages in the range until the ...range_end()
> notifier gets called.  The PPC-specific KVM code doesn't get any
> explicit notification of that; instead, we are supposed to use
> mmu_notifier_retry() to test whether we are or have been inside a
> range flush notifier pair while we have been getting a page and
> instantiating a host HPTE for the page.
>
> This therefore adds a call to mmu_notifier_retry inside
> kvmppc_mmu_map_page().  This call is inside a region locked with
> kvm->mmu_lock, which is the same lock that is taken by the KVM
> MMU notifier functions, thus ensuring that no new notification can
> proceed while we are in the locked region.  Inside this region we
> also create the host HPTE and link the corresponding hpte_cache
> structure into the lists used to find it later.  We cannot allocate
> the hpte_cache structure inside this locked region because that can
> lead to deadlock, so we allocate it outside the region and free it
> if we end up not using it.
>
> This also moves the updates of vcpu3s->hpte_cache_count inside the
> regions locked with vcpu3s->mmu_lock, and does the increment in
> kvmppc_mmu_hpte_cache_map() when the pte is added to the cache
> rather than when it is allocated, so that hpte_cache_count stays
> accurate.
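
Restating the sequence the changelog describes, my reading of
kvmppc_mmu_map_page() with this patch applied is roughly the following.
This is a condensed sketch, not the literal code; all names are taken
from the hunks below, and the error paths are abbreviated:

	/* snapshot the notifier sequence before looking up the page */
	mmu_seq = kvm->mmu_notifier_seq;
	smp_rmb();

	/* may sleep; an invalidation can run concurrently from here on */
	hpaddr = kvmppc_gfn_to_pfn(vcpu, orig_pte->raddr >> PAGE_SHIFT, ...);

	/* allocate the hpte_cache outside mmu_lock to avoid deadlock */
	cpte = kvmppc_mmu_hpte_cache_next(vcpu);

	spin_lock(&kvm->mmu_lock);
	if (!cpte || mmu_notifier_retry(kvm, mmu_seq)) {
		/* an invalidation ran (or is running) since the snapshot */
		r = -EAGAIN;
		goto out_unlock;
	}
	/* insert the host HPTE and add cpte to the cache under mmu_lock */
	...
out_unlock:
	spin_unlock(&kvm->mmu_lock);
	if (cpte)
		kvmppc_mmu_hpte_cache_free(cpte);	/* allocation went unused */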
>
> Signed-off-by: Paul Mackerras <paulus@xxxxxxxxx>
> ---
>  arch/powerpc/include/asm/kvm_book3s.h |  1 +
>  arch/powerpc/kvm/book3s_64_mmu_host.c | 37 ++++++++++++++++++++++++++++++++---------
>  arch/powerpc/kvm/book3s_mmu_hpte.c    | 14 ++++++++++----
>  3 files changed, 39 insertions(+), 13 deletions(-)
>
> diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
> index 4fe6864..e711e77 100644
> --- a/arch/powerpc/include/asm/kvm_book3s.h
> +++ b/arch/powerpc/include/asm/kvm_book3s.h
> @@ -143,6 +143,7 @@ extern long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr,
>
>  extern void kvmppc_mmu_hpte_cache_map(struct kvm_vcpu *vcpu, struct hpte_cache *pte);
>  extern struct hpte_cache *kvmppc_mmu_hpte_cache_next(struct kvm_vcpu *vcpu);
> +extern void kvmppc_mmu_hpte_cache_free(struct hpte_cache *pte);
>  extern void kvmppc_mmu_hpte_destroy(struct kvm_vcpu *vcpu);
>  extern int kvmppc_mmu_hpte_init(struct kvm_vcpu *vcpu);
>  extern void kvmppc_mmu_invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte);
> diff --git a/arch/powerpc/kvm/book3s_64_mmu_host.c b/arch/powerpc/kvm/book3s_64_mmu_host.c
> index 7fcf38f..b7e9504 100644
> --- a/arch/powerpc/kvm/book3s_64_mmu_host.c
> +++ b/arch/powerpc/kvm/book3s_64_mmu_host.c
> @@ -93,6 +93,13 @@ int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte,
>  	int r = 0;
>  	int hpsize = MMU_PAGE_4K;
>  	bool writable;
> +	unsigned long mmu_seq;
> +	struct kvm *kvm = vcpu->kvm;
> +	struct hpte_cache *cpte;
> +
> +	/* used to check for invalidations in progress */
> +	mmu_seq = kvm->mmu_notifier_seq;
> +	smp_rmb();
>
>  	/* Get host physical address for gpa */
>  	hpaddr = kvmppc_gfn_to_pfn(vcpu, orig_pte->raddr >> PAGE_SHIFT,
> @@ -143,6 +150,14 @@ int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte,
>
>  	hash = hpt_hash(vpn, mmu_psize_defs[hpsize].shift, MMU_SEGSIZE_256M);
>
> +	cpte = kvmppc_mmu_hpte_cache_next(vcpu);
> +
> +	spin_lock(&kvm->mmu_lock);
> +	if (!cpte || mmu_notifier_retry(kvm, mmu_seq)) {
> +		r = -EAGAIN;

Paul, I am trying to understand the flow here: does "retry" mean that we do
not create the mapping and instead return to the guest, which will fault
again, at which point we retry the mapping?
Thanks
-Bharat

> +		goto out_unlock;
> +	}
> +
>  map_again:
>  	hpteg = ((hash & htab_hash_mask) * HPTES_PER_GROUP);
>
> @@ -150,7 +165,7 @@ map_again:
>  	if (attempt > 1)
>  		if (ppc_md.hpte_remove(hpteg) < 0) {
>  			r = -1;
> -			goto out;
> +			goto out_unlock;
>  		}
>
>  	ret = ppc_md.hpte_insert(hpteg, vpn, hpaddr, rflags, vflags,
> @@ -163,8 +178,6 @@ map_again:
>  		attempt++;
>  		goto map_again;
>  	} else {
> -		struct hpte_cache *pte = kvmppc_mmu_hpte_cache_next(vcpu);
> -
>  		trace_kvm_book3s_64_mmu_map(rflags, hpteg,
>  					    vpn, hpaddr, orig_pte);
>
> @@ -175,15 +188,21 @@ map_again:
>  			hpteg = ((hash & htab_hash_mask) * HPTES_PER_GROUP);
>  		}
>
> -		pte->slot = hpteg + (ret & 7);
> -		pte->host_vpn = vpn;
> -		pte->pte = *orig_pte;
> -		pte->pfn = hpaddr >> PAGE_SHIFT;
> -		pte->pagesize = hpsize;
> +		cpte->slot = hpteg + (ret & 7);
> +		cpte->host_vpn = vpn;
> +		cpte->pte = *orig_pte;
> +		cpte->pfn = hpaddr >> PAGE_SHIFT;
> +		cpte->pagesize = hpsize;
>
> -		kvmppc_mmu_hpte_cache_map(vcpu, pte);
> +		kvmppc_mmu_hpte_cache_map(vcpu, cpte);
> +		cpte = NULL;
>  	}
> +
> +out_unlock:
> +	spin_unlock(&kvm->mmu_lock);
>  	kvm_release_pfn_clean(hpaddr >> PAGE_SHIFT);
> +	if (cpte)
> +		kvmppc_mmu_hpte_cache_free(cpte);
>
>  out:
>  	return r;
> diff --git a/arch/powerpc/kvm/book3s_mmu_hpte.c b/arch/powerpc/kvm/book3s_mmu_hpte.c
> index d2d280b..6b79bfc 100644
> --- a/arch/powerpc/kvm/book3s_mmu_hpte.c
> +++ b/arch/powerpc/kvm/book3s_mmu_hpte.c
> @@ -98,6 +98,8 @@ void kvmppc_mmu_hpte_cache_map(struct kvm_vcpu *vcpu, struct hpte_cache *pte)
>  				 &vcpu3s->hpte_hash_vpte_64k[index]);
>  #endif
>
> +	vcpu3s->hpte_cache_count++;
> +
>  	spin_unlock(&vcpu3s->mmu_lock);
>  }
>
> @@ -131,10 +133,10 @@ static void invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte)
>  #ifdef CONFIG_PPC_BOOK3S_64
>  	hlist_del_init_rcu(&pte->list_vpte_64k);
>  #endif
> +	vcpu3s->hpte_cache_count--;
>
>  	spin_unlock(&vcpu3s->mmu_lock);
>
> -	vcpu3s->hpte_cache_count--;
>  	call_rcu(&pte->rcu_head, free_pte_rcu);
>  }
>
> @@ -331,15 +333,19 @@ struct hpte_cache *kvmppc_mmu_hpte_cache_next(struct kvm_vcpu *vcpu)
>  	struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
>  	struct hpte_cache *pte;
>
> -	pte = kmem_cache_zalloc(hpte_cache, GFP_KERNEL);
> -	vcpu3s->hpte_cache_count++;
> -
>  	if (vcpu3s->hpte_cache_count == HPTEG_CACHE_NUM)
>  		kvmppc_mmu_pte_flush_all(vcpu);
>
> +	pte = kmem_cache_zalloc(hpte_cache, GFP_KERNEL);
> +
>  	return pte;
>  }
>
> +void kvmppc_mmu_hpte_cache_free(struct hpte_cache *pte)
> +{
> +	kmem_cache_free(hpte_cache, pte);
> +}
> +
>  void kvmppc_mmu_hpte_destroy(struct kvm_vcpu *vcpu)
>  {
>  	kvmppc_mmu_pte_flush(vcpu, 0, 0);
> --
> 1.8.3.1