Re: [PATCH 2/3] migration dirtybitmap support ARMv7

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On 15/04/14 02:24, Mario Smarduch wrote:
> 
> - Support write protection of entire VM address space
> - Split pmds section in migration mode
> - Write protect dirty pages on Dirty log read 
> 
> Signed-off-by: Mario Smarduch <m.smarduch@xxxxxxxxxxx>
> ---
>  arch/arm/kvm/mmu.c |  158 +++++++++++++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 156 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
> index 7789857..502e776 100644
> --- a/arch/arm/kvm/mmu.c
> +++ b/arch/arm/kvm/mmu.c
> @@ -56,6 +56,13 @@ static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
>  		kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa);
>  }
>  
> +void kvm_tlb_flush_vmid(struct kvm *kvm)
> +{
> +	phys_addr_t x;
> +	/* based on function description 2nd argument is irrelevent */
> +	kvm_tlb_flush_vmid_ipa(kvm, x);
> +}

We don't declare a variable just for the sake of having one. The address
passed to kvm_tlb_flush_vmid_ipa() serves a purpose, and while ARMv7
doesn't have a way to invalidate by IPA, you cannot violate the internal
API by passing junk to it. Think of the children!! (ARMv8 in this case) ;-)

If you need the invalidation of a full VMID, then introduce the proper
API at HYP level.

>  static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
>  				  int min, int max)
>  {
> @@ -639,6 +646,143 @@ static bool transparent_hugepage_adjust(pfn_t *pfnp, phys_addr_t *ipap)
>  	return false;
>  }
>  
> +/*
> + * Called when QEMU retrieves the dirty log and write protects dirty pages
> + * for next QEMU call to retrieve the dirty logn

You really need to explain a few things here. What is this trying to do?
What is mask? Where is it called from? What are the side effects?
Locking conditions?

> + */
> +void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
> +		struct kvm_memory_slot *slot,
> +		gfn_t gfn_offset, unsigned long mask)
> +{
> +	phys_addr_t ipa;
> +	pgd_t *pgdp = kvm->arch.pgd, *pgd;
> +	pud_t *pud;
> +	pmd_t *pmd;
> +	pte_t *pte, new_pte;
> +
> +	while (mask) {
> +		ipa = (slot->base_gfn + gfn_offset + __ffs(mask)) << PAGE_SHIFT;
> +		pgd = pgdp + pgd_index(ipa);
> +		if (!pgd_present(*pgd))
> +			goto update_mask;
> +		pud = pud_offset(pgd, ipa);
> +		if (!pud_present(*pud))
> +			goto update_mask;
> +		pmd = pmd_offset(pud, ipa);
> +		if (!pmd_present(*pmd))
> +			goto update_mask;
> +		pte = pte_offset_kernel(pmd, ipa);
> +		if (!pte_present(*pte))
> +			goto update_mask;
> +		if ((*pte & L_PTE_S2_RDWR) == L_PTE_S2_RDONLY)
> +			goto update_mask;
> +		new_pte = pfn_pte(pte_pfn(*pte), PAGE_S2);
> +		*pte = new_pte;
> +update_mask:
> +		mask &= mask - 1;
> +	}
> +}
> +
> +/*
> + * In migration splits PMDs into PTEs to keep track of dirty pages. Without
> + * spliting light execution prevents migration.

Same as above. We really need to understand the context, the conditions,
and how you make sure this doesn't race against the rest of the VM.

> + */
> +bool split_pmd(struct kvm *kvm, pmd_t *pmd, u64 addr)
> +{
> +	struct page *page;
> +	pfn_t pfn = pmd_pfn(*pmd);
> +	pte_t *pte, new_pte;
> +	int i;
> +
> +	page = alloc_page(GFP_KERNEL);
> +	if (page == NULL)
> +		return false;
> +
> +	pte = page_address(page);
> +	for (i = 0; i < PMD_SIZE/PAGE_SIZE; i++) {

Use PTRS_PER_PMD

> +		new_pte = pfn_pte(pfn+i, PAGE_S2);

Missing spaces.

> +		pte[i] = new_pte;
> +	}
> +	kvm_clean_pte(pte);
> +	pmd_populate_kernel(NULL, pmd, pte);
> +
> +	/*
> +	* flush the whole TLB for VM  relying on hardware broadcast
> +	*/
> +	kvm_tlb_flush_vmid(kvm);

Why do you nuke the whole TLBs for this VM? I assume you're going to
repeatedly call this for all the huge pages, aren't you? Can you delay
this flush to do it only once?

> +	get_page(virt_to_page(pte));
> +	return true;
> +}
> +
> +/*
> + * Called from QEMU when migration dirty logging is started. Write the protect
> + * current set. Future faults writes are tracked through WP of when dirty log
> + * log.

Same as above.

> + */
> +
> +void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
> +{
> +	pgd_t *pgd;
> +	pud_t *pud;
> +	pmd_t *pmd;
> +	pte_t *pte, new_pte;
> +	pgd_t *pgdp = kvm->arch.pgd;
> +	struct kvm_memory_slot *memslot = id_to_memslot(kvm->memslots, slot);
> +	u64 start = memslot->base_gfn << PAGE_SHIFT;
> +	u64 end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;
> +	u64 addr = start, addr1;
> +
> +	spin_lock(&kvm->mmu_lock);
> +	kvm->arch.migration_in_progress = true;
> +	while (addr < end) {
> +		if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
> +			kvm_tlb_flush_vmid(kvm);

Looks like you're extremely flush happy. If you're holding the lock, why
do you need all the extra flushes in the previous function?

> +			cond_resched_lock(&kvm->mmu_lock);
> +		}
> +
> +		pgd = pgdp + pgd_index(addr);
> +		if (!pgd_present(*pgd)) {
> +			addr = pgd_addr_end(addr, end);
> +			continue;
> +		}
> +
> +		pud = pud_offset(pgd, addr);
> +		if (pud_huge(*pud) || !pud_present(*pud)) {
> +			addr = pud_addr_end(addr, end);
> +			continue;
> +		}
> +
> +		pmd = pmd_offset(pud, addr);
> +		if (!pmd_present(*pmd)) {
> +			addr = pmd_addr_end(addr, end);
> +			continue;
> +		}
> +
> +		if (kvm_pmd_huge(*pmd)) {
> +			if (!split_pmd(kvm, pmd, addr)) {
> +				kvm->arch.migration_in_progress = false;
> +				return;

Bang, you're dead.

> +			}
> +			addr = pmd_addr_end(addr, end);
> +			continue;
> +		}
> +
> +		pte = pte_offset_kernel(pmd, addr);
> +		addr1 = addr;

What is addr1 used for?

> +		addr += PAGE_SIZE;
> +		if (!pte_present(*pte))
> +			continue;
> +
> +		if ((*pte & L_PTE_S2_RDWR) == L_PTE_S2_RDONLY)
> +			continue;
> +
> +		new_pte = pfn_pte(pte_pfn(*pte), PAGE_S2);
> +		*pte = new_pte;
> +	}
> +	kvm_tlb_flush_vmid(kvm);

OK. So it looks to me that only *this* flush and the one just before the
cond_resched are useful. What am I missing?

> +	spin_unlock(&kvm->mmu_lock);
> +}
> +
>  static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>  			  struct kvm_memory_slot *memslot,
>  			  unsigned long fault_status)
> @@ -652,6 +796,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>  	struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
>  	struct vm_area_struct *vma;
>  	pfn_t pfn;
> +	bool migration_active = vcpu->kvm->arch.migration_in_progress;

The flag may not be observable until you've actually taken the mmu_lock.

>  	write_fault = kvm_is_write_fault(kvm_vcpu_get_hsr(vcpu));
>  	if (fault_status == FSC_PERM && !write_fault) {
> @@ -705,10 +850,11 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>  	spin_lock(&kvm->mmu_lock);
>  	if (mmu_notifier_retry(kvm, mmu_seq))
>  		goto out_unlock;
> -	if (!hugetlb && !force_pte)
> +	/* During migration don't rebuild huge pages */
> +	if (!hugetlb && !force_pte && !migration_active)
>  		hugetlb = transparent_hugepage_adjust(&pfn, &fault_ipa);
>  
> -	if (hugetlb) {
> +	if (!migration_active && hugetlb) {
>  		pmd_t new_pmd = pfn_pmd(pfn, PAGE_S2);
>  		new_pmd = pmd_mkhuge(new_pmd);
>  		if (writable) {
> @@ -720,6 +866,12 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>  	} else {
>  		pte_t new_pte = pfn_pte(pfn, PAGE_S2);
>  		if (writable) {
> +			if (migration_active && hugetlb) {
> +				/* get back pfn from fault_ipa */
> +				pfn += (fault_ipa >> PAGE_SHIFT) &
> +					((1 << (PMD_SHIFT - PAGE_SHIFT))-1);
> +				new_pte = pfn_pte(pfn, PAGE_S2);

Please explain this.

> +			}
>  			kvm_set_s2pte_writable(&new_pte);
>  			kvm_set_pfn_dirty(pfn);
>  		}
> @@ -727,6 +879,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>  		ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, false);
>  	}
>  
> +	if (writable)
> +		mark_page_dirty(kvm, gfn);
>  
>  out_unlock:
>  	spin_unlock(&kvm->mmu_lock);
> 

Thanks,

	M.
-- 
Jazz is not dead. It just smells funny...
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html




[Index of Archives]     [KVM ARM]     [KVM ia64]     [KVM ppc]     [Virtualization Tools]     [Spice Development]     [Libvirt]     [Libvirt Users]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite Questions]     [Linux Kernel]     [Linux SCSI]     [XFree86]
  Powered by Linux