Re: [PATCH 3/6] KVM: Dirty memory tracking for performant checkpointing and improved live migration

On 4/28/2016 5:13 AM, Huang, Kai wrote:
> Hi,
> 
> On 4/27/2016 7:24 AM, Cao, Lei wrote:
>> Implement the remaining memory tracking API.
>>
>> Signed-off-by: Lei Cao <lei.cao@xxxxxxxxxxx>
>> ---
>>  arch/x86/include/asm/kvm_host.h |   5 +
>>  arch/x86/kvm/mmu.c              |  93 +++++
>>  include/uapi/linux/kvm.h        |   4 +-
>>  virt/kvm/kvm_main.c             | 610 +++++++++++++++++++++++++++++-
>>  4 files changed, 699 insertions(+), 13 deletions(-)
>>
>> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
>> index b7e3944..52bff2b 100644
>> --- a/arch/x86/include/asm/kvm_host.h
>> +++ b/arch/x86/include/asm/kvm_host.h
>> @@ -1030,6 +1030,11 @@ void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
>>  				   gfn_t gfn_offset, unsigned long mask);
>>  void kvm_mmu_zap_all(struct kvm *kvm);
>>  void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, struct kvm_memslots *slots);
>> +void kvm_mmu_mt_enable_log_dirty(struct kvm *kvm);
>> +void kvm_mmu_mt_disable_log_dirty(struct kvm *kvm);
>> +int kvm_mt_mmu_reset_gfn(struct kvm *kvm, u64 slot_offset);
>> +gfn_t kvm_mt_slot_offset_to_gfn(struct kvm *kvm, u64 slot_offset);
>> +
>>  unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm);
>>  void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages);
>>
>> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
>> index 1ff4dbb..a36475a 100644
>> --- a/arch/x86/kvm/mmu.c
>> +++ b/arch/x86/kvm/mmu.c
>> @@ -1443,6 +1443,58 @@ restart:
>>  	return 0;
>>  }
>>
>> +static struct kvm_memory_slot *kvm_memslot_from_id(struct kvm *kvm, int slot_id)
>> +{
>> +	int i;
>> +	struct kvm_memory_slot *memslot;
>> +	struct kvm_memslots *slots;
>> +
>> +	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
>> +		slots = __kvm_memslots(kvm, i);
>> +		kvm_for_each_memslot(memslot, slots) {
>> +			if (memslot->id == slot_id)
>> +				return memslot;
>> +		}
>> +	}
>> +	return NULL;
>> +}
>> +
>> +gfn_t kvm_mt_slot_offset_to_gfn(struct kvm *kvm, u64 slot_offset)
>> +{
>> +	struct kvm_memory_slot *slot;
>> +	int slot_id;
>> +	gfn_t offset;
>> +
>> +	slot_id = MT_SLOT_FROM_SLOT_OFFSET(slot_offset);
>> +	slot = kvm_memslot_from_id(kvm, slot_id);
>> +	if (slot == NULL) {
>> +		pr_warn("KVM: bad slot_id %d\n", slot_id);
>> +		return kvm->mt.max_gfn+1;
>> +	}
>> +	offset  = MT_OFFSET_FROM_SLOT_OFFSET(slot_offset);
>> +	return offset + slot->base_gfn;
>> +}
>> +
>> +int kvm_mt_mmu_reset_gfn(struct kvm *kvm, u64 slot_offset)
>> +{
>> +	struct kvm_memory_slot *slot;
>> +	int slot_id;
>> +	gfn_t offset, gfn;
>> +
>> +	slot_id = MT_SLOT_FROM_SLOT_OFFSET(slot_offset);
>> +	slot = kvm_memslot_from_id(kvm, slot_id);
>> +	offset  = MT_OFFSET_FROM_SLOT_OFFSET(slot_offset);
>> +	gfn = offset + slot->base_gfn;
>> +
>> +	if (gfn > kvm->mt.max_gfn) {
>> +		pr_warn("KVM: bad gfn %lx\n", (long)gfn);
>> +		return 0;
>> +	}
>> +
>> +	kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, slot, offset, 1);
>> +	return 1;
>> +}
>> +
>>  struct slot_rmap_walk_iterator {
>>  	/* input fields. */
>>  	struct kvm_memory_slot *slot;
>> @@ -4762,6 +4814,47 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
>>  		kvm_flush_remote_tlbs(kvm);
>>  }
>>
>> +void kvm_mmu_mt_enable_log_dirty(struct kvm *kvm)
>> +{
>> +	int i;
>> +	struct kvm_memslots *slots;
>> +	struct kvm_memory_slot *memslot;
>> +
>> +	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
>> +		slots = __kvm_memslots(kvm, i);
>> +
>> +		kvm_for_each_memslot(memslot, slots) {
>> +			if (memslot->id < KVM_USER_MEM_SLOTS) {
>> +				if (kvm_x86_ops->slot_enable_log_dirty)
>> +					kvm_x86_ops->slot_enable_log_dirty(kvm,
>> +						memslot);
>> +				else
>> +					kvm_mmu_slot_remove_write_access(kvm,
>> +						memslot);
>> +			}
>> +		}
>> +	}
>> +}
>> +
>> +void kvm_mmu_mt_disable_log_dirty(struct kvm *kvm)
>> +{
>> +	int i;
>> +	struct kvm_memslots *slots;
>> +	struct kvm_memory_slot *memslot;
>> +
>> +	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
>> +		slots = __kvm_memslots(kvm, i);
>> +
>> +		kvm_for_each_memslot(memslot, slots) {
>> +			if (memslot->id < KVM_USER_MEM_SLOTS) {
>> +				if (kvm_x86_ops->slot_disable_log_dirty)
>> +					kvm_x86_ops->slot_disable_log_dirty(kvm,
>> +						memslot);
>> +			}
>> +		}
>> +	}
>> +}
>> +
>>  static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
>>  					 struct kvm_rmap_head *rmap_head)
>>  {
>> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
>> index 2bce4db..736668d 100644
>> --- a/include/uapi/linux/kvm.h
>> +++ b/include/uapi/linux/kvm.h
>> @@ -1344,11 +1344,11 @@ struct mt_enable {
>>  #define MT_OFFSET_MASK		(0x0000ffffffffffffUL)
>>
>>  #define MT_MAKE_SLOT_OFFSET(slot, offset)			\
>> -	do {							\
>> +	({							\
>>  		__u64 slot_off = offset & MT_OFFSET_MASK;	\
>>  		slot_off |= ((__u64)slot << 48);		\
>>  		slot_off;					\
>> -	} while (0)
>> +	})
>>
>>  #define MT_OFFSET_FROM_SLOT_OFFSET(slot_off)		\
>>  	(slot_off & MT_OFFSET_MASK)
>> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
>> index fe46067..ba99cbc6 100644
>> --- a/virt/kvm/kvm_main.c
>> +++ b/virt/kvm/kvm_main.c
>> @@ -1795,8 +1795,12 @@ int kvm_vcpu_read_guest_atomic(struct kvm_vcpu *vcpu, gpa_t gpa,
>>  }
>>  EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_atomic);
>>
>> -static int __kvm_write_guest_page(struct kvm_memory_slot *memslot, gfn_t gfn,
>> -			          const void *data, int offset, int len)
>> +static void mt_mark_page_dirty(struct kvm *kvm, struct kvm_memory_slot *slot,
>> +	gfn_t gfn, struct kvm_vcpu *vcpu);
> 
> One general comment: wouldn't it be better to divide kvm_mt and embed it
> in kvm_memory_slot? In my understanding the main difference between the
> bitmap and your log-dirty mechanism is that you are using a list rather
> than a bitmap, and I think embedding the dirty_gfn_list in
> kvm_memory_slot would simplify a lot of your code.
> 
> Thanks,
> -Kai
> 

It's true that one difference of the new mechanism is the use of a list
instead of a bitmap. Another difference is that the dirty list is per
vcpu rather than per memory slot, so that a vcpu can update its list
without holding a lock.

It should be noted that what is saved on the dirty list is
(memslot id | offset), not a gfn (see mt_mark_page_dirty()). The dirty
list is in fact named "gfnlist" throughout the code, which probably
causes confusion. I'll fix that.
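
For illustration only (not part of the patch), here is a minimal
stand-alone sketch of how a dirty-list entry packs the memslot id into
the top 16 bits and the in-slot page offset into the low 48 bits,
following the MT_* macros in include/uapi/linux/kvm.h above:

#include <stdio.h>
#include <stdint.h>

#define MT_OFFSET_MASK	0x0000ffffffffffffULL

/* Pack a memslot id (top 16 bits) and an in-slot page offset (low 48 bits). */
static uint64_t make_slot_offset(uint16_t slot, uint64_t offset)
{
	return (offset & MT_OFFSET_MASK) | ((uint64_t)slot << 48);
}

int main(void)
{
	uint64_t entry = make_slot_offset(3, 0x12345);

	/*
	 * Recover the two fields, as MT_SLOT_FROM_SLOT_OFFSET and
	 * MT_OFFSET_FROM_SLOT_OFFSET do in the patch.
	 */
	printf("slot=%u offset=0x%llx\n",
	       (unsigned int)(entry >> 48),
	       (unsigned long long)(entry & MT_OFFSET_MASK));
	return 0;
}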

>> +
>> +static int __kvm_write_guest_page(struct kvm *kvm,
>> +				struct kvm_memory_slot *memslot, gfn_t gfn,
>> +				const void *data, int offset, int len)
>>  {
>>  	int r;
>>  	unsigned long addr;
>> @@ -1808,6 +1812,8 @@ static int __kvm_write_guest_page(struct kvm_memory_slot *memslot, gfn_t gfn,
>>  	if (r)
>>  		return -EFAULT;
>>  	mark_page_dirty_in_slot(memslot, gfn);
>> +	if (memslot && (memslot->id >= 0 && memslot->id < KVM_USER_MEM_SLOTS))
>> +		mt_mark_page_dirty(kvm, memslot, gfn, NULL);
>>  	return 0;
>>  }
>>
>> @@ -1816,7 +1822,7 @@ int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn,
>>  {
>>  	struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
>>
>> -	return __kvm_write_guest_page(slot, gfn, data, offset, len);
>> +	return __kvm_write_guest_page(kvm, slot, gfn, data, offset, len);
>>  }
>>  EXPORT_SYMBOL_GPL(kvm_write_guest_page);
>>
>> @@ -1825,7 +1831,7 @@ int kvm_vcpu_write_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
>>  {
>>  	struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
>>
>> -	return __kvm_write_guest_page(slot, gfn, data, offset, len);
>> +	return __kvm_write_guest_page(vcpu->kvm, slot, gfn, data, offset, len);
>>  }
>>  EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest_page);
>>
>> @@ -1929,6 +1935,10 @@ int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
>>  	if (r)
>>  		return -EFAULT;
>>  	mark_page_dirty_in_slot(ghc->memslot, ghc->gpa >> PAGE_SHIFT);
>> +	if (ghc->memslot && (ghc->memslot->id >= 0 &&
>> +		ghc->memslot->id < KVM_USER_MEM_SLOTS))
>> +		mt_mark_page_dirty(kvm, ghc->memslot, ghc->gpa >> PAGE_SHIFT,
>> +			NULL);
>>
>>  	return 0;
>>  }
>> @@ -1996,11 +2006,95 @@ static void mark_page_dirty_in_slot(struct kvm_memory_slot *memslot,
>>  	}
>>  }
>>
>> +/*
>> + * We have some new dirty pages for our sublist waiter.  Enough to merit
>> + * waking it up?
>> + */
>> +static void mt_sw_add_pages(struct kvm *kvm)
>> +{
>> +	int avail = kvm->mt.tot_pages - kvm->mt.fetch_count;
>> +	struct sublist_waiter *swp = &kvm->mt.sw;
>> +
>> +	spin_lock(&kvm->mt.sw_lock);
>> +
>> +	if (swp->goal && (avail >= swp->goal)) {
>> +		kvm->mt.fetch_count += avail;
>> +		swp->goal = 0;
>> +		wake_up(&swp->wq);
>> +	}
>> +
>> +	spin_unlock(&kvm->mt.sw_lock);
>> +}
>> +
>> +#define DIRTY_GFN_ADD_GRANULARITY      (256)
>> +
>> +static void mt_mark_page_dirty(struct kvm *kvm, struct kvm_memory_slot *slot,
>> +	gfn_t gfn, struct kvm_vcpu *vcpu)
>> +{
>> +	int use_kvm;            /* add to global list? */
>> +	struct gfn_list *gfnlist;
>> +	int slot_id = slot->id;
>> +	__u64 offset = gfn - slot->base_gfn;
>> +	__u64 slot_offset;
>> +
>> +	/*
>> +	 * Try to add dirty page to vcpu list.  If vcpu is NULL or
>> +	 * vcpu list is full, then try to add to kvm master list.
>> +	 */
>> +
>> +	if (!kvm->mt.active)
>> +		return;
>> +
>> +	if (slot->id >= KVM_USER_MEM_SLOTS)
>> +		return;
>> +
>> +	if (gfn > kvm->mt.max_gfn)
>> +		return;
>> +
>> +	/* if we're avoiding duplicates, is this one already marked? */
>> +	if (kvm->mt.bmap && test_and_set_bit(gfn, kvm->mt.bmap))
>> +		return;
>> +
>> +	slot_offset = MT_MAKE_SLOT_OFFSET(slot_id, offset);
>> +
>> +	use_kvm = (vcpu == NULL);
>> +
>> +	if (vcpu) {
>> +		gfnlist = &vcpu->kvm->vcpu_mt[vcpu->vcpu_id].gfn_list;
>> +		if (gfnlist->dirty_index == gfnlist->max_dirty) {
>> +			use_kvm = 1;
>> +			gfnlist->overflow = 1;
>> +			/* Fall back to master gfn list.*/
>> +			gfnlist = &kvm->mt.gfn_list;
>> +		}
>> +	} else {
>> +		gfnlist = &kvm->mt.gfn_list;
>> +	}
>> +
>> +	spin_lock(&gfnlist->lock);
>> +	if (gfnlist->dirty_index >= gfnlist->max_dirty) {
>> +		gfnlist->overflow = 1;
>> +	} else {
>> +		gfnlist->dirty_gfns[gfnlist->dirty_index++] = slot_offset;
>> +		if ((gfnlist->dirty_index % DIRTY_GFN_ADD_GRANULARITY) == 0) {
>> +			spin_lock(&kvm->mt.lock);
>> +			kvm->mt.tot_pages += DIRTY_GFN_ADD_GRANULARITY;
>> +			mt_sw_add_pages(kvm);
>> +			spin_unlock(&kvm->mt.lock);
>> +		}
>> +	}
>> +	spin_unlock(&gfnlist->lock);
>> +}
>> +
>>  void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
>>  {
>>  	struct kvm_memory_slot *memslot;
>>
>>  	memslot = gfn_to_memslot(kvm, gfn);
>> +	if (memslot) {
>> +		if (memslot->id >= 0 && memslot->id < KVM_USER_MEM_SLOTS)
>> +			mt_mark_page_dirty(kvm, memslot, gfn, NULL);
>> +	}
>>  	mark_page_dirty_in_slot(memslot, gfn);
>>  }
>>  EXPORT_SYMBOL_GPL(mark_page_dirty);
>> @@ -2010,6 +2104,10 @@ void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn)
>>  	struct kvm_memory_slot *memslot;
>>
>>  	memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
>> +	if (memslot) {
>> +		if (memslot->id >= 0 && memslot->id < KVM_USER_MEM_SLOTS)
>> +			mt_mark_page_dirty(vcpu->kvm, memslot, gfn, vcpu);
>> +	}
>>  	mark_page_dirty_in_slot(memslot, gfn);
>>  }
>>  EXPORT_SYMBOL_GPL(kvm_vcpu_mark_page_dirty);
>> @@ -2823,8 +2921,6 @@ static u64 kvm_get_max_gfn(struct kvm *kvm)
>>  	return num_gfn - 1;
>>  }
>>
>> -#define DIRTY_GFN_ADD_GRANULARITY      (256)
>> -
>>  /*
>>   * Return a the smallest multiple of DIRTY_GFN_ADD_GRANULARITY that is >= goal.
>>   */
>> @@ -3010,31 +3106,523 @@ static int kvm_vm_ioctl_mt_init(struct kvm *kvm, struct mt_setup *mts)
>>  		return -EINVAL;
>>  }
>>
>> +static int kvm_enable_mt(struct kvm *kvm)
>> +{
>> +	int rc = 0;
>> +
>> +	if (kvm->mt.active) {
>> +		pr_warn("KVM: vm %d, MT already active\n",
>> +			current->pid);
>> +		rc = -EINVAL;
>> +		goto enable_mt_done;
>> +	}
>> +
>> +	kvm_mmu_mt_enable_log_dirty(kvm);
>> +	if (kvm->mt.bmap)
>> +		memset(kvm->mt.bmap, 0, kvm->mt.bmapsz);
>> +
>> +	kvm->mt.active = 1;
>> +
>> +enable_mt_done:
>> +
>> +	return rc;
>> +}
>> +
>> +static int kvm_disable_mt(struct kvm *kvm)
>> +{
>> +	int rc = 0;
>> +
>> +	if (!kvm->mt.active) {
>> +		pr_warn("KVM: vm %d, MT already disabled\n",
>> +			current->pid);
>> +		rc = -EINVAL;
>> +		goto disable_mt_done;
>> +	}
>> +
>> +	kvm_mmu_mt_disable_log_dirty(kvm);
>> +	kvm->mt.active = 0;
>> +
>> +disable_mt_done:
>> +
>> +	return rc;
>> +}
>> +
>>  static int kvm_vm_ioctl_mt_enable(struct kvm *kvm, struct mt_enable *mte)
>>  {
>> -	return -EINVAL;
>> +	if ((mte->flags & 0x1) == 1)
>> +		return kvm_enable_mt(kvm);
>> +	else if ((mte->flags & 0x1) == 0)
>> +		return kvm_disable_mt(kvm);
>> +	else
>> +		return -EINVAL;
>>  }
>>
>>  static int kvm_vm_ioctl_mt_prepare_cp(struct kvm *kvm,
>>  				      struct mt_prepare_cp *mtpcp)
>>  {
>> -	return -EINVAL;
>> +	int i;
>> +	struct kvm_vcpu *vcpu;
>> +	struct gfn_list *gfnlist;
>> +
>> +	if (!kvm->mt.active)
>> +		return -EINVAL;
>> +
>> +	kvm->mt.cp_id = mtpcp->cpid;
>> +
>> +	kvm_for_each_vcpu(i, vcpu, kvm) {
>> +		gfnlist = &vcpu->kvm->vcpu_mt[vcpu->vcpu_id].gfn_list;
>> +		spin_lock(&gfnlist->lock);
>> +		gfnlist->fetch_index = 0;
>> +		gfnlist->reset_index = 0;
>> +		gfnlist->dirty_index = 0;
>> +		gfnlist->overflow = 0;
>> +		spin_unlock(&gfnlist->lock);
>> +	}
>> +
>> +	gfnlist = &kvm->mt.gfn_list;
>> +	spin_lock(&gfnlist->lock);
>> +	gfnlist->fetch_index = 0;
>> +	gfnlist->reset_index = 0;
>> +	gfnlist->dirty_index = 0;
>> +	gfnlist->overflow = 0;
>> +	spin_unlock(&gfnlist->lock);
>> +
>> +	kvm->mt.quiesced = 0;
>> +	kvm->mt.allow_blocking = 1;
>> +	kvm->mt.tot_pages  = kvm->mt.fetch_count = 0;
>> +
>> +	return 0;
>> +}
>> +
>> +static bool mt_reset_gfn(struct kvm *kvm, u64 slot_offset)
>> +{
>> +	gfn_t gfn;
>> +
>> +	gfn = kvm_mt_slot_offset_to_gfn(kvm, slot_offset);
>> +	if (gfn > kvm->mt.max_gfn)
>> +		return 0;
>> +
>> +	if (kvm->mt.bmap) {
>> +		if (kvm->mt.quiesced) {
>> +			/*
>> +			 * Goal is to reset entire bmap, but don't need
>> +			 * atomics if we are quiesced
>> +			 */
>> +			int offset32 = gfn/32;
>> +			int *p = (int *)(kvm->mt.bmap) + offset32;
>> +			*p = 0;
>> +		} else {
>> +			clear_bit(gfn, kvm->mt.bmap);
>> +		}
>> +	}
>> +
>> +	return kvm_mt_mmu_reset_gfn(kvm, slot_offset);
>> +}
>> +
>> +#define GFN_RESET_BATCH        (64)
>> +
>> +static int mt_reset_all_gfns(struct kvm *kvm)
>> +{
>> +	int i, j;
>> +	struct kvm_vcpu *vcpu;
>> +	struct gfn_list *gfnlist;
>> +	bool cleared = false;
>> +	int reset_start, count, avail;
>> +
>> +	if (!kvm->mt.active)
>> +		return -EINVAL;
>> +
>> +	if (!kvm->mt.quiesced)
>> +		return -EINVAL;
>> +
>> +	spin_lock(&kvm->mmu_lock);
>> +
>> +	kvm_for_each_vcpu(i, vcpu, kvm) {
>> +		gfnlist = &vcpu->kvm->vcpu_mt[vcpu->vcpu_id].gfn_list;
>> +
>> +vcpu_gfn_loop:
>> +
>> +		spin_lock(&gfnlist->lock);
>> +		reset_start = gfnlist->reset_index;
>> +		avail = gfnlist->dirty_index - gfnlist->reset_index;
>> +		count = avail > GFN_RESET_BATCH ? GFN_RESET_BATCH : avail;
>> +		gfnlist->reset_index += count;
>> +		spin_unlock(&gfnlist->lock);
>> +
>> +		for (j = reset_start; j < reset_start + count; j++)
>> +			cleared |= mt_reset_gfn(kvm, gfnlist->dirty_gfns[j]);
>> +
>> +		if (count)
>> +			goto vcpu_gfn_loop;
>> +	}
>> +
>> +	gfnlist = &kvm->mt.gfn_list;
>> +
>> +global_gfn_loop:
>> +
>> +	spin_lock(&gfnlist->lock);
>> +	reset_start = gfnlist->reset_index;
>> +	avail = gfnlist->dirty_index - gfnlist->reset_index;
>> +	count = avail > GFN_RESET_BATCH ? GFN_RESET_BATCH : avail;
>> +	gfnlist->reset_index += count;
>> +	spin_unlock(&gfnlist->lock);
>> +
>> +	for (j = reset_start; j < reset_start + count; j++)
>> +		cleared |= mt_reset_gfn(kvm, gfnlist->dirty_gfns[j]);
>> +
>> +	if (count)
>> +		goto global_gfn_loop;
>> +
>> +	spin_unlock(&kvm->mmu_lock);
>> +
>> +
>> +	if (cleared)
>> +		kvm_flush_remote_tlbs(kvm);
>> +
>> +	return 0;
>>  }
>>
>>  static int kvm_vm_ioctl_mt_rearm_gfns(struct kvm *kvm)
>>  {
>> -	return -EINVAL;
>> +	return mt_reset_all_gfns(kvm);
>> +}
>> +
>> +static int mt_unblock_sw(struct kvm *kvm)
>> +{
>> +	struct sublist_waiter *swp;
>> +
>> +	if (!kvm->mt.active)
>> +		return -EINVAL;
>> +
>> +	spin_lock(&kvm->mt.sw_lock);
>> +
>> +	kvm->mt.allow_blocking = 0;
>> +
>> +	/* Make sure allow_blocking is clear before the wake up */
>> +	mb();
>> +
>> +	swp = &kvm->mt.sw;
>> +	wake_up(&swp->wq);
>> +
>> +	spin_unlock(&kvm->mt.sw_lock);
>> +
>> +	return 0;
>>  }
>>
>>  static int kvm_vm_ioctl_mt_quiesced(struct kvm *kvm)
>>  {
>> -	return -EINVAL;
>> +	if (!kvm->mt.active)
>> +		return -EINVAL;
>> +
>> +	kvm->mt.quiesced = 1;
>> +
>> +	/* wake up the sublist waiter */
>> +	mt_unblock_sw(kvm);
>> +
>> +	if (kvm->mt.gfn_list.overflow)
>> +		return -ENOMEM;
>> +
>> +	return 0;
>> +}
>> +
>> +static int mt_sublist_req_nowait(struct kvm *kvm,
>> +				struct mt_sublist_fetch_info *msfi, int offset)
>> +{
>> +	int i, j, avail, goal = msfi->gfn_info.count;
>> +	struct kvm_vcpu *vcpu;
>> +	__u64 *gfndst, *gfnsrc;
>> +	int rc = 0;
>> +	__u64 slot_offset;
>> +	int index;
>> +
>> +	/* Clearing dirty/write bits requires tlb flush before exit */
>> +	int cleared = 0;
>> +
>> +	/* Don't need to lock gfn lists if we're in VM blackout */
>> +	int need_locks = !kvm->mt.quiesced;
>> +
>> +	/* Consolidate flags */
>> +	int reset = msfi->flags & MT_FETCH_REARM;
>> +	int bmap = kvm->mt.bmap != NULL;
>> +
>> +	if (goal == 0)
>> +		return 0;
>> +
>> +	gfndst = &msfi->gfn_info.gfnlist[offset];
>> +	msfi->gfn_info.count = offset;
>> +
>> +	kvm_for_each_vcpu(i, vcpu, kvm) {
>> +		int len, rem;
>> +		int vcpu_id;
>> +		struct gfn_list *gfnlist;
>> +
>> +		vcpu_id = vcpu->vcpu_id;
>> +		gfnlist = &vcpu->kvm->vcpu_mt[vcpu_id].gfn_list;
>> +
>> +		mutex_lock(&gfnlist->mtx);
>> +		if (need_locks)
>> +			spin_lock(&gfnlist->lock);
>> +
>> +		avail = gfnlist->dirty_index - gfnlist->fetch_index;
>> +		if (!avail) {
>> +			if (need_locks)
>> +				spin_unlock(&gfnlist->lock);
>> +			mutex_unlock(&gfnlist->mtx);
>> +			continue;
>> +		}
>> +		avail = avail > goal ? goal : avail;
>> +		for (j = 0; j < avail; j++) {
>> +			index = gfnlist->fetch_index+j;
>> +			slot_offset = gfnlist->dirty_gfns[index];
>> +			kvm->mt.gfn_buf[j] = kvm_mt_slot_offset_to_gfn(kvm,
>> +						slot_offset);
>> +		}
>> +		gfnsrc = &kvm->mt.gfn_buf[0];
>> +
>> +		if (need_locks)
>> +			spin_unlock(&gfnlist->lock);
>> +
>> +		rem = copy_to_user(gfndst, gfnsrc,
>> +				avail*sizeof(*gfndst)) / sizeof(*gfndst);
>> +
>> +		/*
>> +		 * Need mmu_lock if we're going to do kvm_mt_mmu_reset_gfn
>> +		 * below, but must take mmu_lock _before_ gfnlist lock.
>> +		 */
>> +		if (reset)
>> +			spin_lock(&kvm->mmu_lock);
>> +
>> +		if (need_locks)
>> +			spin_lock(&gfnlist->lock);
>> +
>> +		len = avail - rem;
>> +		msfi->gfn_info.count += len;
>> +		gfndst += len;
>> +		if (reset) {
>> +			__u64 gfn;
>> +
>> +			for (j = 0; j < len; j++) {
>> +				index = gfnlist->fetch_index+j;
>> +				slot_offset = gfnlist->dirty_gfns[index];
>> +				gfn = kvm_mt_slot_offset_to_gfn(kvm,
>> +					slot_offset);
>> +				cleared +=
>> +					kvm_mt_mmu_reset_gfn(kvm, slot_offset);
>> +				if (bmap)
>> +					clear_bit(gfn, kvm->mt.bmap);
>> +			}
>> +			gfnlist->reset_index += len;
>> +		}
>> +		gfnlist->fetch_index += len;
>> +
>> +		if (need_locks)
>> +			spin_unlock(&gfnlist->lock);
>> +		if (reset)
>> +			spin_unlock(&kvm->mmu_lock);
>> +		mutex_unlock(&gfnlist->mtx);
>> +
>> +		if (len != avail) {
>> +			rc = -EFAULT;
>> +			goto copy_done_err;
>> +		}
>> +
>> +		goal -= avail;
>> +		if (goal == 0)
>> +			break;
>> +	}
>> +
>> +	/* If we still need more gfns, consult the master list */
>> +	if (goal) {
>> +		int len, rem;
>> +		struct gfn_list *gfnlist = &kvm->mt.gfn_list;
>> +
>> +		mutex_lock(&gfnlist->mtx);
>> +		if (need_locks)
>> +			spin_lock(&gfnlist->lock);
>> +
>> +		avail = gfnlist->dirty_index - gfnlist->fetch_index;
>> +		if (!avail) {
>> +			if (need_locks)
>> +				spin_unlock(&gfnlist->lock);
>> +			mutex_unlock(&gfnlist->mtx);
>> +			goto copy_done_no_err;
>> +		}
>> +		avail = avail > goal ? goal : avail;
>> +		for (j = 0; j < avail; j++) {
>> +			index = gfnlist->fetch_index+j;
>> +			slot_offset = gfnlist->dirty_gfns[index];
>> +			kvm->mt.gfn_buf[j] = kvm_mt_slot_offset_to_gfn(kvm,
>> +						slot_offset);
>> +		}
>> +		gfnsrc = &kvm->mt.gfn_buf[0];
>> +
>> +		if (need_locks)
>> +			spin_unlock(&gfnlist->lock);
>> +
>> +		rem = copy_to_user(gfndst, gfnsrc,
>> +				avail*sizeof(*gfndst)) / sizeof(*gfndst);
>> +
>> +		/*
>> +		 * Need mmu_lock if we're going to do kvm_mt_mmu_reset_gfn
>> +		 * below, but must take mmu_lock _before_ gfnlist lock.
>> +		 */
>> +		if (reset)
>> +			spin_lock(&kvm->mmu_lock);
>> +
>> +		if (need_locks)
>> +			spin_lock(&gfnlist->lock);
>> +
>> +		len = avail - rem;
>> +		msfi->gfn_info.count += len;
>> +		gfnlist->fetch_index += len;
>> +		if (reset) {
>> +			__u64 slot_offset;
>> +			__u64 gfn;
>> +
>> +			for (j = 0; j < len; j++) {
>> +				index = gfnlist->fetch_index+j;
>> +				slot_offset = gfnlist->dirty_gfns[index];
>> +				gfn = kvm_mt_slot_offset_to_gfn(kvm,
>> +					slot_offset);
>> +				cleared +=
>> +					kvm_mt_mmu_reset_gfn(kvm, slot_offset);
>> +				if (bmap)
>> +					clear_bit(gfn, kvm->mt.bmap);
>> +			}
>> +			gfnlist->reset_index += len;
>> +		}
>> +
>> +		if (need_locks)
>> +			spin_unlock(&gfnlist->lock);
>> +		if (reset)
>> +			spin_unlock(&kvm->mmu_lock);
>> +		mutex_unlock(&gfnlist->mtx);
>> +
>> +		if (len != avail) {
>> +			rc = -EFAULT;
>> +			goto copy_done_err;
>> +		}
>> +
>> +		goal -= avail;
>> +	}
>> +
>> +copy_done_no_err:
>> +
>> +copy_done_err:
>> +
>> +	if (cleared)
>> +		kvm_flush_remote_tlbs(kvm);
>> +
>> +	return rc;
>> +}
>> +
>> +static int mt_sublist_req_wait(struct kvm *kvm,
>> +				struct mt_sublist_fetch_info *msfi)
>> +{
>> +	struct sublist_waiter *swp;
>> +	int goal = msfi->gfn_info.count;
>> +	int offset;
>> +	int rc;
>> +
>> +	if (msfi->gfn_info.count == 0)
>> +		return 0;
>> +
>> +	spin_lock(&kvm->mt.sw_lock);
>> +	if (!kvm->mt.allow_blocking) {
>> +		spin_unlock(&kvm->mt.sw_lock);
>> +		return -EINVAL;
>> +	}
>> +	spin_unlock(&kvm->mt.sw_lock);
>> +
>> +	rc = mt_sublist_req_nowait(kvm, msfi, 0);
>> +	if (rc || (msfi->gfn_info.count == goal))
>> +		return rc;
>> +
>> +	offset = msfi->gfn_info.count;
>> +
>> +	spin_lock(&kvm->mt.sw_lock);
>> +
>> +	if (kvm->mt.sw_busy) {
>> +		spin_unlock(&kvm->mt.sw_lock);
>> +		return -EBUSY;
>> +	}
>> +	kvm->mt.sw_busy = 1;
>> +
>> +	swp = &kvm->mt.sw;
>> +	swp->goal = goal;
>> +
>> +	spin_unlock(&kvm->mt.sw_lock);
>> +
>> +	rc = wait_event_interruptible(swp->wq,
>> +			!kvm->mt.allow_blocking || !swp->goal);
>> +
>> +	spin_lock(&kvm->mt.sw_lock);
>> +
>> +	kvm->mt.sw_busy = 0;
>> +
>> +	spin_unlock(&kvm->mt.sw_lock);
>> +
>> +	if (rc)
>> +		return rc;
>> +
>> +	msfi->gfn_info.count = goal - offset;
>> +
>> +	return mt_sublist_req_nowait(kvm, msfi, offset);
>> +}
>> +
>> +static int mt_get_dirty_count(struct kvm *kvm,
>> +				struct mt_sublist_fetch_info *msfi)
>> +{
>> +	int i, avail = 0;
>> +	struct kvm_vcpu *vcpu;
>> +	struct gfn_list *gfnlist;
>> +
>> +	/* Don't need to lock gfn lists if we're in VM blackout */
>> +	int need_locks = !kvm->mt.quiesced;
>> +
>> +	kvm_for_each_vcpu(i, vcpu, kvm) {
>> +		gfnlist = &vcpu->kvm->vcpu_mt[vcpu->vcpu_id].gfn_list;
>> +
>> +		mutex_lock(&gfnlist->mtx);
>> +		if (need_locks)
>> +			spin_lock(&gfnlist->lock);
>> +		avail += gfnlist->dirty_index - gfnlist->fetch_index;
>> +		if (need_locks)
>> +			spin_unlock(&gfnlist->lock);
>> +		mutex_unlock(&gfnlist->mtx);
>> +	}
>> +
>> +	gfnlist = &kvm->mt.gfn_list;
>> +
>> +	mutex_lock(&gfnlist->mtx);
>> +	if (need_locks)
>> +		spin_lock(&gfnlist->lock);
>> +	avail += gfnlist->dirty_index - gfnlist->fetch_index;
>> +	if (need_locks)
>> +		spin_unlock(&gfnlist->lock);
>> +	mutex_unlock(&gfnlist->mtx);
>> +
>> +	msfi->gfn_info.count = avail;
>> +
>> +	return 0;
>>  }
>>
>>  static int kvm_vm_ioctl_mt_sublist_fetch(struct kvm *kvm,
>>  					 struct mt_sublist_fetch_info *mtsfi)
>>  {
>> -	return -EINVAL;
>> +	if (!kvm->mt.active)
>> +		return -EINVAL;
>> +
>> +	if (mtsfi->gfn_info.gfnlist == NULL)
>> +		return mt_get_dirty_count(kvm, mtsfi);
>> +
>> +	if (mtsfi->gfn_info.count == 0)
>> +		return 0;
>> +
>> +	if (!(mtsfi->flags & MT_FETCH_WAIT))
>> +		return mt_sublist_req_nowait(kvm, mtsfi, 0);
>> +
>> +	return mt_sublist_req_wait(kvm, mtsfi);
>>  }
>>
>>  static int kvm_vm_ioctl_mt_dirty_trigger(struct kvm *kvm, int dirty_trigger)
>>
> 
