On Wed, Jun 10, 2009 at 07:23:25PM +0300, Izik Eidus wrote:
> Change the dirty page tracking to work with the dirty bit instead of
> write-protection page faults. Right now dirty page tracking works with
> the help of page faults: when we want to track a page for being dirty,
> we write protect it and mark it dirty when we get a write page fault.
> This patch moves that over to looking at the dirty bit of the spte.
>
> Signed-off-by: Izik Eidus <ieidus@xxxxxxxxxx>
> ---
>  arch/ia64/kvm/kvm-ia64.c        |    4 +++
>  arch/powerpc/kvm/powerpc.c      |    4 +++
>  arch/s390/kvm/kvm-s390.c        |    4 +++
>  arch/x86/include/asm/kvm_host.h |    3 ++
>  arch/x86/kvm/mmu.c              |   42 ++++++++++++++++++++++++++++++++++++--
>  arch/x86/kvm/svm.c              |    7 ++++++
>  arch/x86/kvm/vmx.c              |    7 ++++++
>  arch/x86/kvm/x86.c              |   26 ++++++++++++++++++++---
>  include/linux/kvm_host.h        |    1 +
>  virt/kvm/kvm_main.c             |    6 ++++-
>  10 files changed, 96 insertions(+), 8 deletions(-)
>
> diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
> index 3199221..5914128 100644
> --- a/arch/ia64/kvm/kvm-ia64.c
> +++ b/arch/ia64/kvm/kvm-ia64.c
> @@ -1809,6 +1809,10 @@ void kvm_arch_exit(void)
>          kvm_vmm_info = NULL;
>  }
>
> +void kvm_arch_get_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
> +{
> +}
> +
>  static int kvm_ia64_sync_dirty_log(struct kvm *kvm,
>                                  struct kvm_dirty_log *log)
>  {
> diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
> index 2cf915e..6beb368 100644
> --- a/arch/powerpc/kvm/powerpc.c
> +++ b/arch/powerpc/kvm/powerpc.c
> @@ -418,6 +418,10 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
>          return -ENOTSUPP;
>  }
>  #ifndef KVM_ARCH_HAVE_DIRTY_LOG
> +void kvm_arch_get_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
> +{
> +}
> +

#endif in virt/kvm/main.c

> index c7b0cc2..8a24149 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -527,6 +527,7 @@ struct kvm_x86_ops {
>          int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
>          int (*get_tdp_level)(void);
>          u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
> +        int (*dirty_bit_support)(void);
>  };
>
>  extern struct kvm_x86_ops *kvm_x86_ops;
> @@ -796,4 +797,6 @@ int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
>  int kvm_age_hva(struct kvm *kvm, unsigned long hva);
>  int cpuid_maxphyaddr(struct kvm_vcpu *vcpu);
>
> +int is_dirty_and_clean_rmapp(struct kvm *kvm, unsigned long *rmapp);
> +
>  #endif /* _ASM_X86_KVM_HOST_H */
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index 809cce0..500e0e2 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -140,6 +140,8 @@ module_param(oos_shadow, bool, 0644);
>  #define ACC_USER_MASK    PT_USER_MASK
>  #define ACC_ALL          (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
>
> +#define SPTE_DONT_DIRTY (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
> +
>  #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
>
>  struct kvm_rmap_desc {
> @@ -629,6 +631,29 @@ static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
>          return NULL;
>  }
>
> +int is_dirty_and_clean_rmapp(struct kvm *kvm, unsigned long *rmapp)
> +{
> +        u64 *spte;
> +        int dirty = 0;
> +
> +        if (!shadow_dirty_mask)
> +                return 0;
> +
> +        spte = rmap_next(kvm, rmapp, NULL);
> +        while (spte) {
> +                if (*spte & PT_DIRTY_MASK) {
> +                        set_shadow_pte(spte, (*spte &= ~PT_DIRTY_MASK) |
> +                                       SPTE_DONT_DIRTY);
> +                        dirty = 1;
> +                        break;
> +                }
> +                spte = rmap_next(kvm, rmapp, spte);
> +        }
> +
> +        return dirty;
> +}
> +
> +
>  static int rmap_write_protect(struct kvm *kvm, u64 gfn)
>  {
>          unsigned long *rmapp;
> @@ -1381,11 +1406,17 @@ static int mmu_zap_unsync_children(struct kvm *kvm,
>  static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
>  {
>          int ret;
> +        int i;
> +
>          ++kvm->stat.mmu_shadow_zapped;
>          ret = mmu_zap_unsync_children(kvm, sp);
>          kvm_mmu_page_unlink_children(kvm, sp);
>          kvm_mmu_unlink_parents(kvm, sp);
>          kvm_flush_remote_tlbs(kvm);
> +        for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
> +                if (sp->spt[i] & PT_DIRTY_MASK)
> +                        mark_page_dirty(kvm, sp->gfns[i]);
> +        }

Also need to transfer the dirty bit in other places, probably.
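Something along these lines (untested, illustrative only, and the helper
name is made up) could be called from rmap_remove()/mmu_set_spte() before
an spte is dropped or overwritten, so the hardware dirty bit is not lost
outside of kvm_mmu_zap_page():

/*
 * Illustrative helper only: fold the hardware dirty bit into the dirty
 * log before a shadow pte goes away, mirroring the check added in
 * kvm_mmu_zap_page() above.
 */
static void transfer_spte_dirty_bit(struct kvm *kvm, u64 spte, gfn_t gfn)
{
        if (shadow_dirty_mask && (spte & PT_DIRTY_MASK))
                mark_page_dirty(kvm, gfn);
}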
>          if (!sp->role.invalid && !sp->role.direct)
>                  unaccount_shadowed(kvm, sp->gfn);
>          if (sp->unsync)
> @@ -1676,7 +1707,10 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
>           * whether the guest actually used the pte (in order to detect
>           * demand paging).
>           */
> -        spte = shadow_base_present_pte | shadow_dirty_mask;
> +        spte = shadow_base_present_pte;
> +        if (!(spte & SPTE_DONT_DIRTY))
> +                spte |= shadow_dirty_mask;
> +
>          if (!speculative)
>                  spte |= shadow_accessed_mask;
>          if (!dirty)
> @@ -1725,8 +1759,10 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
>                  }
>          }
>
> -        if (pte_access & ACC_WRITE_MASK)
> -                mark_page_dirty(vcpu->kvm, gfn);
> +        if (!shadow_dirty_mask) {
> +                if (pte_access & ACC_WRITE_MASK)
> +                        mark_page_dirty(vcpu->kvm, gfn);
> +        }

You can avoid the mark_page_dirty if SPTE_DONT_DIRTY? (which is a good
idea, gfn_to_memslot_unaliased and friends show up high in profiling).
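Roughly, and only as one way to read that suggestion (untested), the tail
of set_spte() could key the skip on the per-spte flag rather than on the
global shadow_dirty_mask:

        /*
         * Only pay for mark_page_dirty()/gfn_to_memslot when the dirty
         * bit is not being tracked in hardware for this spte.
         */
        if (!(spte & SPTE_DONT_DIRTY) && (pte_access & ACC_WRITE_MASK))
                mark_page_dirty(vcpu->kvm, gfn);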
>  set_pte:
>          set_shadow_pte(shadow_pte, spte);
> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
> index 37397f6..5b6351e 100644
> --- a/arch/x86/kvm/svm.c
> +++ b/arch/x86/kvm/svm.c
> @@ -2724,6 +2724,11 @@ static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
>          return 0;
>  }
>
> +static int svm_dirty_bit_support(void)
> +{
> +        return 1;
> +}
> +
>  static struct kvm_x86_ops svm_x86_ops = {
>          .cpu_has_kvm_support = has_svm,
>          .disabled_by_bios = is_disabled,
> @@ -2785,6 +2790,8 @@ static struct kvm_x86_ops svm_x86_ops = {
>          .set_tss_addr = svm_set_tss_addr,
>          .get_tdp_level = get_npt_level,
>          .get_mt_mask = svm_get_mt_mask,
> +
> +        .dirty_bit_support = svm_dirty_bit_support,
>  };
>
>  static int __init svm_init(void)
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index 673bcb3..8903314 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -3774,6 +3774,11 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
>          return ret;
>  }
>
> +static int vmx_dirty_bit_support(void)
> +{
> +        return false;
> +}
> +
>  static struct kvm_x86_ops vmx_x86_ops = {
>          .cpu_has_kvm_support = cpu_has_kvm_support,
>          .disabled_by_bios = vmx_disabled_by_bios,
> @@ -3833,6 +3838,8 @@ static struct kvm_x86_ops vmx_x86_ops = {
>          .set_tss_addr = vmx_set_tss_addr,
>          .get_tdp_level = get_ept_level,
>          .get_mt_mask = vmx_get_mt_mask,
> +
> +        .dirty_bit_support = vmx_dirty_bit_support,
>  };
>
>  static int __init vmx_init(void)
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 272e2e8..e06387c 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -1963,6 +1963,20 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm,
>          return 0;
>  }
>
> +void kvm_arch_get_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
> +{
> +        int i;
> +
> +        spin_lock(&kvm->mmu_lock);
> +        for (i = 0; i < memslot->npages; ++i) {
> +                if (!test_bit(i, memslot->dirty_bitmap)) {
> +                        if (is_dirty_and_clean_rmapp(kvm, &memslot->rmap[i]))
> +                                set_bit(i, memslot->dirty_bitmap);
> +                }
> +        }
> +        spin_unlock(&kvm->mmu_lock);
> +}
> +
>  /*
>   * Get (and clear) the dirty memory log for a memory slot.
>   */
> @@ -1982,10 +1996,14 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
>
>          /* If nothing is dirty, don't bother messing with page tables. */
>          if (is_dirty) {
> -                spin_lock(&kvm->mmu_lock);
> -                kvm_mmu_slot_remove_write_access(kvm, log->slot);
> -                spin_unlock(&kvm->mmu_lock);
> -                kvm_flush_remote_tlbs(kvm);
> +                if (!kvm_x86_ops->dirty_bit_support()) {
> +                        spin_lock(&kvm->mmu_lock);
> +                        /* remove_write_access() flush the tlb */
> +                        kvm_mmu_slot_remove_write_access(kvm, log->slot);
> +                        spin_unlock(&kvm->mmu_lock);
> +                } else {
> +                        kvm_flush_remote_tlbs(kvm);
> +                }
>                  memslot = &kvm->memslots[log->slot];
>                  n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;
>                  memset(memslot->dirty_bitmap, 0, n);
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index cdf1279..d1657a3 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -250,6 +250,7 @@ int kvm_dev_ioctl_check_extension(long ext);
>
>  int kvm_get_dirty_log(struct kvm *kvm,
>                          struct kvm_dirty_log *log, int *is_dirty);
> +void kvm_arch_get_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot);
>  int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
>                                  struct kvm_dirty_log *log);
>
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 3046e9c..a876231 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -1135,8 +1135,11 @@ int __kvm_set_memory_region(struct kvm *kvm,
>          }
>
>          /* Free page dirty bitmap if unneeded */
> -        if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES))
> +        if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES)) {
>                  new.dirty_bitmap = NULL;
> +                if (old.flags & KVM_MEM_LOG_DIRTY_PAGES)
> +                        kvm_arch_flush_shadow(kvm);
> +        }

What's this for?

The idea of making the dirty bit accessible (it can also be used to map
host ptes read-only when the guest fault is read-only, for KSM (*)) is
good. But better to first introduce the infrastructure to handle the
dirty bit (making sure the bit is transferred properly), and then have
dirty logging make use of it.

(*) EPT violations do not transfer fault information to the page fault
handler, but it is available (there's a vm-exit field); a rough sketch of
reading it follows at the end of this mail.

>          r = -ENOMEM;
>
> @@ -1279,6 +1282,7 @@ int kvm_get_dirty_log(struct kvm *kvm,
>          if (!memslot->dirty_bitmap)
>                  goto out;
>
> +        kvm_arch_get_dirty_log(kvm, memslot);
>          n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;
>
>          for (i = 0; !any && i < n/sizeof(long); ++i)
> --
> 1.5.6.5
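Regarding the vm-exit field mentioned in the footnote above, here is a
rough, untested sketch (the helper name is made up) of pulling the access
type out of the EPT violation exit qualification in vmx.c; bit 1 of the
qualification is set for data writes, so a read-only guest fault could be
mapped read-only on the host side (useful for KSM):

/*
 * Sketch only: in handle_ept_violation() the exit qualification tells
 * us whether the faulting access was a write.
 */
static int ept_violation_is_write(void)
{
        unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);

        return !!(exit_qualification & (1 << 1));       /* bit 1: data write */
}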