Re: [PATCH v2 2/4] KVM: Dirty memory tracking for performant checkpointing solutions

On 04/01/2017 21:43, Cao, Lei wrote:
> Introduce memory tracking data structures and implement the new ioctls
> and support functions.
> 
> Signed-off-by: Lei Cao <lei.cao@xxxxxxxxxxx>
> ---
>  include/linux/kvm_host.h |  25 ++++++
>  virt/kvm/kvm_main.c      | 220 +++++++++++++++++++++++++++++++++++++++++++++--
>  2 files changed, 239 insertions(+), 6 deletions(-)
> 
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 1c5190d..7a85b30 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -204,6 +204,22 @@ struct kvm_mmio_fragment {
>  	unsigned len;
>  };
>  
> +#ifdef KVM_DIRTY_LOG_PAGE_OFFSET
> +struct dirty_gfn_t {
> +	__u32 slot; /* as_id | slot_id */
> +	__u32 pad;
> +	__u64 offset;
> +};
> +
> +struct gfn_list_t {
> +	__u32 dirty_index; /* where to put the next dirty GFN */
> +	__u32 avail_index; /* GFNs before this can be harvested */
> +	__u32 fetch_index; /* the next GFN to be harvested */
> +	__u32 pad;
> +	struct dirty_gfn_t dirty_gfns[0];
> +};

These are part of the userspace API, so they go in
include/uapi/linux/kvm.h and they should be included unconditionally.
Also, as mentioned (indirectly) in my reply to the cover letter, the
types for userspace API should start with "kvm_".
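
For instance (the names below are only a suggestion):

	struct kvm_dirty_gfn {
		__u32 slot; /* as_id | slot_id */
		__u32 pad;
		__u64 offset;
	};

	struct kvm_dirty_gfn_list {
		__u32 dirty_index; /* where to put the next dirty GFN */
		__u32 avail_index; /* GFNs before this can be harvested */
		__u32 fetch_index; /* the next GFN to be harvested */
		__u32 pad;
		struct kvm_dirty_gfn dirty_gfns[0];
	};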

Please document in include/linux/kvm.h or virt/kvm/kvm_main.c the
requirements for enabling KVM_DIRTY_LOG_PAGE_OFFSET.  As far as I can
see there are at least the following:

- any memory accesses done by KVM should use kvm_vcpu_write_* instead of
kvm_write_* if possible, otherwise the per-VM log will fill too fast

- you should provide kvm_arch_mmu_enable_log_dirty_pt_masked and you
should not have a separate step to synchronize a hardware dirty bitmap
with KVM's

The required size of the global log probably correlates with the number
of VCPUs, so what is a good rule of thumb for sizing the logs?  Could it
be something like "M times the number of VCPUs, plus N" (documenting M
and N, of course)?
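
It would also help to document the protocol behind the three indices.
If I'm reading the series right, after the dirty-count ioctl (backed by
kvm_mt_get_dirty_count) refreshes avail_index, userspace harvests the
mmap'ed log roughly like this (only a sketch, using the renamed struct
from above; handle_dirty_gfn is a made-up placeholder):

	static void harvest(struct kvm_dirty_gfn_list *log)
	{
		/* Entries in [fetch_index, avail_index) are stable; a
		 * read barrier pairing with the producer's smp_wmb is
		 * omitted here.
		 */
		while (log->fetch_index < log->avail_index) {
			struct kvm_dirty_gfn *d =
				&log->dirty_gfns[log->fetch_index++];

			handle_dirty_gfn(d->slot >> 16,  /* as_id */
					 (__u16)d->slot, /* slot_id */
					 d->offset);
		}
	}

Whatever the exact protocol is, it belongs in
Documentation/virtual/kvm/api.txt.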

> @@ -1996,6 +2038,9 @@ int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
>  	r = __copy_to_user((void __user *)ghc->hva + offset, data, len);
>  	if (r)
>  		return -EFAULT;
> +#ifdef KVM_DIRTY_LOG_PAGE_OFFSET
> +	kvm_mt_mark_page_dirty(kvm, ghc->memslot, NULL, gpa >> PAGE_SHIFT);
> +#endif

Please add the kvm and vcpu arguments to mark_page_dirty_in_slot, since
you're modifying all four calls to mark_page_dirty_in_slot itself.

Furthermore, mark_page_dirty_in_slot already has the ingredients needed
to remove the duplicated code (see the sketch after this list):

- it checks for memslot != NULL

- it checks for memslot->dirty_bitmap

- the check for memslot->dirty_bitmap makes it unnecessary to check
memslot->id against KVM_USER_MEM_SLOTS.
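
Something like this (untested):

	static void mark_page_dirty_in_slot(struct kvm *kvm,
					    struct kvm_vcpu *vcpu,
					    struct kvm_memory_slot *memslot,
					    gfn_t gfn)
	{
		if (memslot && memslot->dirty_bitmap) {
			unsigned long rel_gfn = gfn - memslot->base_gfn;

	#ifdef KVM_DIRTY_LOG_PAGE_OFFSET
			/* Must come before set_bit_le, because
			 * kvm_mt_mark_page_dirty skips GFNs whose bit
			 * is already set.
			 */
			kvm_mt_mark_page_dirty(kvm, memslot, vcpu, gfn);
	#endif
			set_bit_le(rel_gfn, memslot->dirty_bitmap);
		}
	}

and then the callers just pass kvm/vcpu down instead of open-coding the
checks.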

>  	mark_page_dirty_in_slot(ghc->memslot, gpa >> PAGE_SHIFT);
>  
>  	return 0;
> @@ -2076,6 +2121,12 @@ void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
>  	struct kvm_memory_slot *memslot;
>  
>  	memslot = gfn_to_memslot(kvm, gfn);
> +#ifdef KVM_DIRTY_LOG_PAGE_OFFSET
> +	if (memslot) {
> +		if (memslot->id >= 0 && memslot->id < KVM_USER_MEM_SLOTS)
> +			kvm_mt_mark_page_dirty(kvm, memslot, NULL, gfn);
> +	}
> +#endif
>  	mark_page_dirty_in_slot(memslot, gfn);
>  }
>  EXPORT_SYMBOL_GPL(mark_page_dirty);
> @@ -2085,6 +2136,13 @@ void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn)
>  	struct kvm_memory_slot *memslot;
>  
>  	memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
> +#ifdef KVM_DIRTY_LOG_PAGE_OFFSET
> +	if (memslot) {
> +		if (memslot->id >= 0 && memslot->id < KVM_USER_MEM_SLOTS)
> +			kvm_mt_mark_page_dirty(vcpu->kvm, memslot,
> +					       vcpu, gfn);
> +	}
> +#endif
>  	mark_page_dirty_in_slot(memslot, gfn);
>  }
>  EXPORT_SYMBOL_GPL(kvm_vcpu_mark_page_dirty);
> @@ -2377,6 +2435,32 @@ static const struct vm_operations_struct kvm_vcpu_vm_ops = {
>  
>  static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
>  {
> +#ifdef KVM_DIRTY_LOG_PAGE_OFFSET
> +	struct kvm_vcpu *vcpu = file->private_data;
> +	unsigned long start, pfn, size;
> +	int ret;
> +	struct gfn_list_t *log;
> +
> +	if (vcpu->kvm->dirty_log_size) {
> +		size = vcpu->kvm->dirty_log_size;
> +		start = vma->vm_start +
> +			KVM_DIRTY_LOG_PAGE_OFFSET*PAGE_SIZE;
> +		log = vcpu->kvm->dirty_logs;
> +		pfn = page_to_pfn(virt_to_page((unsigned long)log));
> +		ret = remap_pfn_range(vma, start, pfn, size,
> +				      vma->vm_page_prot);
> +		if (ret)
> +			return ret;
> +		start = vma->vm_start +
> +			KVM_DIRTY_LOG_PAGE_OFFSET*PAGE_SIZE + size;
> +		log = vcpu->dirty_logs;
> +		pfn = page_to_pfn(virt_to_page((unsigned long)log));
> +		ret = remap_pfn_range(vma, start, pfn, size,
> +				      vma->vm_page_prot);
> +		if (ret)
> +			return ret;
> +	}

Please don't use remap_pfn_range; extend kvm_vcpu_fault instead.  For
the VM-wide log, let's mmap the KVM file descriptor: add a .mmap
callback to kvm_vm_fops and define a new vm_operations_struct
kvm_vm_vm_ops.
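
For the per-VCPU logs that means adding a case to kvm_vcpu_fault for
offsets starting at KVM_DIRTY_LOG_PAGE_OFFSET.  For the VM-wide log,
something like this (a sketch, assuming the log is vmalloc'ed as
suggested below):

	static int kvm_vm_fault(struct vm_area_struct *vma,
				struct vm_fault *vmf)
	{
		struct kvm *kvm = vma->vm_file->private_data;
		struct page *page;

		if (!kvm->dirty_log_size ||
		    vmf->pgoff >= kvm->dirty_log_size >> PAGE_SHIFT)
			return VM_FAULT_SIGBUS;

		/* vmalloc_to_page, since the log is only virtually
		 * contiguous.
		 */
		page = vmalloc_to_page((char *)kvm->dirty_logs +
				       vmf->pgoff * PAGE_SIZE);
		get_page(page);
		vmf->page = page;
		return 0;
	}

	static const struct vm_operations_struct kvm_vm_vm_ops = {
		.fault = kvm_vm_fault,
	};

	static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma)
	{
		vma->vm_ops = &kvm_vm_vm_ops;
		return 0;
	}

plus a ".mmap = kvm_vm_mmap" entry in kvm_vm_fops.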

> +#endif
>  	vma->vm_ops = &kvm_vcpu_vm_ops;
>  	return 0;
>  }
> @@ -2946,19 +3030,143 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
>  }
>  
>  #ifdef KVM_DIRTY_LOG_PAGE_OFFSET
> +void kvm_mt_mark_page_dirty(struct kvm *kvm, struct kvm_memory_slot *slot,
> +	struct kvm_vcpu *vcpu, gfn_t gfn)
> +{
> +	struct gfn_list_t *gfnlist;
> +	int slot_id;
> +	u32 as_id = 0;
> +	u64 offset;
> +
> +	if (!slot || !slot->dirty_bitmap || !kvm->dirty_log_size)
> +		return;
> +
> +	offset = gfn - slot->base_gfn;
> +
> +	if (test_bit_le(offset, slot->dirty_bitmap))
> +		return;
> +
> +	slot_id = slot->id;
> +
> +	if (vcpu)
> +		as_id = kvm_arch_vcpu_memslots_id(vcpu);
> +
> +	if (vcpu)
> +		gfnlist = vcpu->dirty_logs;
> +	else
> +		gfnlist = kvm->dirty_logs;
> +
> +	if (gfnlist->dirty_index == kvm->max_dirty_logs) {
> +		printk(KERN_ERR "dirty log overflow\n");

This should at least be ratelimited; maybe it should even trigger a
WARN_ONCE.
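
For example:

	if (gfnlist->dirty_index == kvm->max_dirty_logs) {
		WARN_ONCE(1, "kvm: dirty GFN list overflow\n");
		return;
	}

(or pr_err_ratelimited if a full WARN is too heavy).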

> +		return;
> +	}
> +
> +	if (!vcpu)
> +		spin_lock(&kvm->dirty_log_lock);
> +	gfnlist->dirty_gfns[gfnlist->dirty_index].slot =
> +		(as_id << 16) | slot_id;
> +	gfnlist->dirty_gfns[gfnlist->dirty_index].offset = offset;
> +	smp_wmb();
> +	gfnlist->dirty_index++;
> +	if (!vcpu)
> +		spin_unlock(&kvm->dirty_log_lock);
> +}
> +
>  static int kvm_mt_set_dirty_log_size(struct kvm *kvm, u32 size)
>  {
> -	return -EINVAL;
> +	struct page *page;
> +
> +	if (!size)
> +		return -EINVAL;
> +
> +	kvm->dirty_log_size = get_order(size) * PAGE_SIZE;
> +	kvm->max_dirty_logs = (kvm->dirty_log_size -
> +			       sizeof(struct gfn_list_t)) /
> +			      sizeof(struct dirty_gfn_t);
> +	page = alloc_pages(GFP_KERNEL | __GFP_ZERO, get_order(size));

Please use vmalloc.  We don't want userspace to trigger large-order
allocations.
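
Note also that "get_order(size) * PAGE_SIZE" is not the rounded-up
size; that would be "PAGE_SIZE << get_order(size)".  With vmalloc you
can simply round with PAGE_ALIGN, something like:

	kvm->dirty_log_size = PAGE_ALIGN(size);
	kvm->max_dirty_logs = (kvm->dirty_log_size -
			       sizeof(struct gfn_list_t)) /
			      sizeof(struct dirty_gfn_t);
	kvm->dirty_logs = vzalloc(kvm->dirty_log_size);
	if (!kvm->dirty_logs)
		return -ENOMEM;
	spin_lock_init(&kvm->dirty_log_lock);

(this is also why the kvm_vm_fault sketch above uses vmalloc_to_page).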

> +	if (!page) {
> +		kvm_put_kvm(kvm);
> +		return -ENOMEM;
> +	}
> +	kvm->dirty_logs = page_address(page);
> +	spin_lock_init(&kvm->dirty_log_lock);
> +	return 0;
> +}
> +
> +static void kvm_mt_reset_gfn(struct kvm *kvm,
> +			     struct dirty_gfn_t *slot_offset)
> +{
> +	struct kvm_memory_slot *slot;
> +	int as_id, id;
> +
> +	as_id = slot_offset->slot >> 16;
> +	id = (u16)slot_offset->slot;
> +	slot = id_to_memslot(__kvm_memslots(kvm, as_id), id);
> +
> +	clear_bit_le(slot_offset->offset, slot->dirty_bitmap);
> +	kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, slot,
> +						slot_offset->offset, 1);

If the guest dirties pages with some locality, would it be beneficial
to check for consecutive dirtied pages and call
kvm_arch_mmu_enable_log_dirty_pt_masked only once per batch?
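
Just to illustrate the idea (hypothetical, and only a win if the log
has runs of nearby GFNs), the reset loop could build a mask per aligned
64-page window:

	int i = 0;

	while (i < gfnlist->dirty_index) {
		u32 slot_id = gfnlist->dirty_gfns[i].slot;
		int as_id = slot_id >> 16, id = (u16)slot_id;
		struct kvm_memory_slot *slot =
			id_to_memslot(__kvm_memslots(kvm, as_id), id);
		u64 base = gfnlist->dirty_gfns[i].offset & ~63ULL;
		unsigned long mask = 0;

		/* Collect all entries of the same slot that fall into
		 * the same 64-page window.
		 */
		while (i < gfnlist->dirty_index &&
		       gfnlist->dirty_gfns[i].slot == slot_id &&
		       (gfnlist->dirty_gfns[i].offset & ~63ULL) == base) {
			mask |= 1ull << (gfnlist->dirty_gfns[i].offset & 63);
			clear_bit_le(gfnlist->dirty_gfns[i].offset,
				     slot->dirty_bitmap);
			i++;
		}
		kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, slot, base,
							mask);
	}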

Paolo

>  }
>  
>  static int kvm_mt_reset_all_gfns(struct kvm *kvm)
>  {
> -	return -EINVAL;
> +	int i, j;
> +	struct kvm_vcpu *vcpu;
> +	int cleared = 0;
> +
> +	if (!kvm->dirty_log_size)
> +		return -EINVAL;
> +
> +	spin_lock(&kvm->mmu_lock);
> +
> +	kvm_for_each_vcpu(i, vcpu, kvm) {
> +		for (j = 0;
> +		     j < vcpu->dirty_logs->dirty_index;
> +		     j++, cleared++)
> +			kvm_mt_reset_gfn(kvm,
> +					 &kvm->dirty_logs->dirty_gfns[j]);
> +		vcpu->dirty_logs->dirty_index = 0;
> +		vcpu->dirty_logs->avail_index = 0;
> +		vcpu->dirty_logs->fetch_index = 0;
> +	}
> +
> +	for (j = 0; j < kvm->dirty_logs->dirty_index; j++, cleared++)
> +		kvm_mt_reset_gfn(kvm, &kvm->dirty_logs->dirty_gfns[j]);
> +	kvm->dirty_logs->dirty_index = 0;
> +	kvm->dirty_logs->avail_index = 0;
> +	kvm->dirty_logs->fetch_index = 0;
> +
> +	spin_unlock(&kvm->mmu_lock);
> +
> +	if (cleared)
> +		kvm_flush_remote_tlbs(kvm);
> +
> +	return 0;
>  }
>  
>  static int kvm_mt_get_dirty_count(struct kvm *kvm, u32 *count)
>  {
> -	return -EINVAL;
> +	int i, avail = 0;
> +	struct kvm_vcpu *vcpu;
> +
> +	if (!kvm->dirty_log_size)
> +		return -EINVAL;
> +
> +	kvm_for_each_vcpu(i, vcpu, kvm) {
> +		vcpu->dirty_logs->avail_index =
> +			READ_ONCE(vcpu->dirty_logs->dirty_index);
> +		avail += vcpu->dirty_logs->avail_index -
> +			 vcpu->dirty_logs->fetch_index;
> +	}
> +
> +	kvm->dirty_logs->avail_index =
> +		READ_ONCE(kvm->dirty_logs->dirty_index);
> +	avail += kvm->dirty_logs->avail_index -
> +		 kvm->dirty_logs->fetch_index;
> +
> +	*count = avail;
> +
> +	return 0;
>  }
>  #endif /* KVM_DIRTY_LOG_PAGE_OFFSET*/
>  
> 