On 4/28/2016 5:05 AM, Huang, Kai wrote:
> Hi,
>
> On 4/27/2016 7:23 AM, Cao, Lei wrote:
>> Introduce memory tracking data structures and implement the new
>> setup/cleanup ioctls.
>>
>> Chief among the KVM performance bottlenecks is the use of large
>> (VM-sized) bitmaps to convey the information about which pages of memory
>> are dirtied. These bitmaps are copied to user-space when QEMU queries
>> KVM for its dirty page information. The use of bitmaps makes sense in the
>> current live-migration method, as it is possible for all of memory to be
>> dirty from one log-dirty pass to another. But in a checkpoint system, the
>> number of dirty pages is bounded such that the VM is paused when it has
>> dirtied a pre-defined number of pages. Traversing a large, sparsely
>> populated bitmap to find set bits is time-consuming, as is copying the
>> bitmap to user-space.
>>
>> The preferred data structure for performant checkpoint implementations is
>> a dense list of guest frame numbers (GFN), which also plays well with PML.
>>
>> Signed-off-by: Lei Cao <lei.cao@xxxxxxxxxxx>
>> ---
>>  include/linux/kvm_host.h |  53 ++++++++
>>  virt/kvm/kvm_main.c      | 255 ++++++++++++++++++++++++++++++++++++-
>>  2 files changed, 307 insertions(+), 1 deletion(-)
>>
>> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
>> index 5276fe0..5793ecf 100644
>> --- a/include/linux/kvm_host.h
>> +++ b/include/linux/kvm_host.h
>> @@ -358,6 +358,54 @@ struct kvm_memslots {
>>  	int used_slots;
>>  };
>>
>> +struct gfn_list {
>> +	int max_dirty;
>> +	__u64 *dirty_gfns;	/* slot / offset */
>> +	int dirty_index;
>> +	int fetch_index;
>> +	int reset_index;
>> +	int overflow;
>> +	spinlock_t lock;
>> +	struct mutex mtx;
>> +};
>> +
>> +struct vcpu_mt {
>> +	struct gfn_list gfn_list;
>> +};
>> +
>> +struct sublist_waiter {
>> +	wait_queue_head_t wq;
>> +	spinlock_t lock;
>> +	int goal;
>> +};
>> +
>> +#define MAX_DIRTY_PAGE_BATCH	(4096/sizeof(int))
>
> 4096 -> PAGE_SIZE ?

Right. This define is no longer used, will remove.

>
>> +
>> +struct kvm_mt {
>> +	long cp_id;
>> +	int active;
>> +	int mode;
>> +	spinlock_t lock;
>> +	u64 max_gfn;
>> +	int quiesced;
>> +	int allow_blocking;
>> +	void *bmap;
>> +	long bmapsz;
>> +	int dirty_trigger;
>> +
>> +	struct gfn_list gfn_list;
>> +
>> +	int tot_pages;
>> +	int fetch_count;
>> +
>> +	struct mutex sublist_mtx;
>> +	struct sublist_waiter sw;
>> +	int sw_busy;
>> +	spinlock_t sw_lock;
>> +
>> +	__u64 *gfn_buf;
>> +};
>> +
>>  struct kvm {
>>  	spinlock_t mmu_lock;
>>  	struct mutex slots_lock;
>> @@ -407,6 +455,11 @@ struct kvm {
>>  #endif
>>  	long tlbs_dirty;
>>  	struct list_head devices;
>> +
>> +
>> +	struct kvm_mt mt;
>> +
>> +	struct vcpu_mt vcpu_mt[KVM_MAX_VCPUS];
>
> Why not make it embedded in kvm_vcpu?

Good idea. Will do.

>
>>  };
>>
>>  #define kvm_err(fmt, ...) \
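For reference, here is roughly what the embed-in-kvm_vcpu rework looks
like (untested sketch; the accessor name below is mine, not part of this
patch):

struct kvm_vcpu {
	...
	struct gfn_list gfn_list;	/* replaces kvm->vcpu_mt[vcpu_id] */
	...
};

static inline struct gfn_list *vcpu_gfn_list(struct kvm_vcpu *vcpu)
{
	return &vcpu->gfn_list;
}

That also removes the KVM_MAX_VCPUS-sized array from struct kvm and the
vcpu_id indexing on every access.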
>> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
>> index 8a582e5..fe46067 100644
>> --- a/virt/kvm/kvm_main.c
>> +++ b/virt/kvm/kvm_main.c
>> @@ -220,10 +220,48 @@ void kvm_reload_remote_mmus(struct kvm *kvm)
>>  	kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD);
>>  }
>>
>> +static void init_gfn_list(struct gfn_list *gfnlist)
>> +{
>> +	gfnlist->dirty_gfns = NULL;
>> +	gfnlist->dirty_index = 0;
>> +	gfnlist->fetch_index = 0;
>> +	gfnlist->reset_index = 0;
>> +	gfnlist->overflow = 0;
>> +}
>> +
>> +static void clear_gfn_list(struct gfn_list *gfnlist)
>> +{
>> +	init_gfn_list(gfnlist);
>> +	gfnlist->max_dirty = 0;
>> +}
>> +
>> +static void clear_kvm_mt(struct kvm *kvm)
>> +{
>> +	clear_gfn_list(&kvm->mt.gfn_list);
>> +	kvm->mt.active = 0;
>> +	kvm->mt.mode = 0;
>> +	kvm->mt.tot_pages = kvm->mt.fetch_count = 0;
>> +}
>> +
>> +static void clear_vcpu_mt(struct kvm_vcpu *vcpu)
>> +{
>> +	struct gfn_list *gfnlist = &vcpu->kvm->vcpu_mt[vcpu->vcpu_id].gfn_list;
>> +
>> +	clear_gfn_list(gfnlist);
>> +}
>> +
>> +static void init_vcpu_mt(struct kvm_vcpu *vcpu)
>> +{
>> +	struct gfn_list *gfnlist = &vcpu->kvm->vcpu_mt[vcpu->vcpu_id].gfn_list;
>> +
>> +	init_gfn_list(gfnlist);
>> +}
>> +
>>  int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
>>  {
>>  	struct page *page;
>>  	int r;
>> +	struct gfn_list *gfnlist;
>>
>>  	mutex_init(&vcpu->mutex);
>>  	vcpu->cpu = -1;
>> @@ -236,6 +274,11 @@ int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
>>  	vcpu->pre_pcpu = -1;
>>  	INIT_LIST_HEAD(&vcpu->blocked_vcpu_list);
>>
>> +	gfnlist = &vcpu->kvm->vcpu_mt[vcpu->vcpu_id].gfn_list;
>> +	clear_vcpu_mt(vcpu);
>> +	spin_lock_init(&gfnlist->lock);
>> +	mutex_init(&gfnlist->mtx);
>> +
>>  	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
>>  	if (!page) {
>>  		r = -ENOMEM;
>> @@ -602,6 +645,16 @@ static struct kvm *kvm_create_vm(unsigned long type)
>>
>>  	preempt_notifier_inc();
>>
>> +	clear_kvm_mt(kvm);
>> +	spin_lock_init(&kvm->mt.lock);
>> +	spin_lock_init(&kvm->mt.sw_lock);
>> +	spin_lock_init(&kvm->mt.gfn_list.lock);
>> +	mutex_init(&kvm->mt.gfn_list.mtx);
>> +	init_waitqueue_head(&kvm->mt.sw.wq);
>> +	spin_lock_init(&kvm->mt.sw.lock);
>> +	kvm->mt.sw.goal = 0;
>> +	mutex_init(&kvm->mt.sublist_mtx);
>> +
>>  	return kvm;
>>
>>  out_err:
>> @@ -642,11 +695,14 @@ static void kvm_destroy_devices(struct kvm *kvm)
>>  	}
>>  }
>>
>> +static int kvm_cleanup_mt(struct kvm *kvm, int force);
>> +
>>  static void kvm_destroy_vm(struct kvm *kvm)
>>  {
>>  	int i;
>>  	struct mm_struct *mm = kvm->mm;
>>
>> +	kvm_cleanup_mt(kvm, 1);
>>  	kvm_arch_sync_events(kvm);
>>  	spin_lock(&kvm_lock);
>>  	list_del(&kvm->vm_list);
>> @@ -2752,9 +2808,206 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
>>  	return kvm_vm_ioctl_check_extension(kvm, arg);
>>  }
>>
>> +static u64 kvm_get_max_gfn(struct kvm *kvm)
>> +{
>> +	int num_gfn = -1;
>> +	struct kvm_memslots *slots = kvm_memslots(kvm);
>> +	struct kvm_memory_slot *memslot;
>> +	int gfn;
>> +
>> +	kvm_for_each_memslot(memslot, slots) {
>> +		gfn = memslot->base_gfn + memslot->npages;
>> +		if (gfn > num_gfn)
>> +			num_gfn = gfn;
>> +	}
>> +	return num_gfn - 1;
>> +}
>
> This might be wrong if there are devices assigned to the guest, as each
> assigned resource also occupies a memory slot. I assume you don't want
> to monitor assigned resources, do you?

Max gfn is only used for a sanity check: when we are told to mark a page
dirty, or to reset the write trap for a page, we make sure that the gfn
is valid.
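Concretely, the guard at the top of the mark-dirty and write-trap-reset
paths looks roughly like this (sketch only; the helper name is made up
for illustration and is not part of this patch):

static bool mt_gfn_in_range(struct kvm *kvm, u64 gfn)
{
	/* reject gfns beyond the highest memslot seen at MT-init time */
	return gfn <= kvm->mt.max_gfn;
}

A stale or bogus gfn can then be refused before it ever touches the
dirty list.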
>
>> +
>> +#define DIRTY_GFN_ADD_GRANULARITY	(256)
>> +
>> +/*
>> + * Return the smallest multiple of DIRTY_GFN_ADD_GRANULARITY that is >= goal.
>> + */
>> +static int mt_calc_max_dirty(int goal)
>> +{
>> +	int val = DIRTY_GFN_ADD_GRANULARITY;
>> +
>> +	while (val < goal)
>> +		val += DIRTY_GFN_ADD_GRANULARITY;
>> +	return val;
>> +}
>> +
>> +static int __kvm_init_mt(struct kvm *kvm, struct mt_setup *mts)
>> +{
>> +	struct kvm_vcpu *vcpu;
>> +	int i, alloc_fail = 0;
>> +	struct gfn_list *gfnlist = &kvm->mt.gfn_list;
>> +	int max_dirty;
>> +
>> +	if (gfnlist->dirty_gfns) {
>> +		pr_warn("KVM: vm %d, MT already initialized\n",
>> +			current->pid);
>> +		return -EBUSY;
>> +	}
>> +
>> +	/*
>> +	 * Internally, max_dirty must be a multiple of DIRTY_GFN_ADD_GRANULARITY
>> +	 */
>> +	max_dirty = mt_calc_max_dirty(mts->max_dirty);
>> +
>> +	/* probably need to make gfn lists for each vcpu to avoid locking */
>> +	gfnlist->dirty_gfns = vmalloc(max_dirty *
>> +				      sizeof(*(gfnlist->dirty_gfns)));
>> +	if (!gfnlist->dirty_gfns)
>> +		return -ENOMEM;
>> +
>> +	if (mts->flags & KVM_MT_OPTION_NO_DUPS) {
>> +		long bmapsz = sizeof(long) *
>> +			((kvm->mt.max_gfn + BITS_PER_LONG)/BITS_PER_LONG);
>> +		kvm->mt.bmap = vmalloc(bmapsz);
>> +		if (!kvm->mt.bmap) {
>> +			vfree(gfnlist->dirty_gfns);
>> +			gfnlist->dirty_gfns = NULL;
>> +			return -ENOMEM;
>> +		}
>> +		kvm->mt.bmapsz = bmapsz;
>> +		memset(kvm->mt.bmap, 0, bmapsz);
>> +		pr_info("KVM: vm %d, using bmap, size %ld\n",
>> +			current->pid, bmapsz);
>> +	}
>> +
>> +	gfnlist->max_dirty = max_dirty;
>> +	gfnlist->dirty_index = 0;
>> +
>> +	kvm->mt.active = 0;
>> +	kvm->mt.tot_pages = 0;
>> +	kvm->mt.fetch_count = 0;
>> +	kvm->mt.quiesced = 0;
>> +	kvm->mt.allow_blocking = 1;
>> +	kvm->mt.sw_busy = 0;
>> +	kvm->mt.gfn_buf = vmalloc(max_dirty * sizeof(*(kvm->mt.gfn_buf)));
>> +	if (!kvm->mt.gfn_buf) {
>> +		vfree(gfnlist->dirty_gfns);
>> +		gfnlist->dirty_gfns = NULL;
>> +		if (kvm->mt.bmap) {
>> +			vfree(kvm->mt.bmap);
>> +			kvm->mt.bmap = NULL;
>> +		}
>> +		return -ENOMEM;
>> +	}
>> +
>> +	kvm_for_each_vcpu(i, vcpu, kvm) {
>> +		init_vcpu_mt(vcpu);
>> +		gfnlist = &vcpu->kvm->vcpu_mt[vcpu->vcpu_id].gfn_list;
>> +		gfnlist->dirty_gfns =
>> +			vmalloc(max_dirty *
>> +				sizeof(*(gfnlist->dirty_gfns)));
>> +		if (!gfnlist->dirty_gfns) {
>> +			pr_warn("KVM: vm %d, MT init, alloc fail, vcpu %d\n",
>> +				current->pid, i);
>> +			alloc_fail = 1;
>> +			break;
>> +		}
>> +		gfnlist->max_dirty = max_dirty;
>> +	}
>> +
>> +	if (alloc_fail) {
>> +		/* cleanup and bail out */
>> +		kvm_for_each_vcpu(i, vcpu, kvm) {
>> +			gfnlist = &vcpu->kvm->vcpu_mt[vcpu->vcpu_id].gfn_list;
>> +			if (gfnlist->dirty_gfns)
>> +				vfree(gfnlist->dirty_gfns);
>> +			clear_vcpu_mt(vcpu);
>> +		}
>> +
>> +		vfree(kvm->mt.gfn_list.dirty_gfns);
>> +		kvm->mt.gfn_list.dirty_gfns = NULL;
>> +		vfree(kvm->mt.bmap);
>> +		kvm->mt.bmap = NULL;
>> +		vfree(kvm->mt.gfn_buf);
>> +		kvm->mt.gfn_buf = NULL;
>> +		clear_kvm_mt(kvm);
>> +		return -ENOMEM;
>> +	}
>> +
>> +	return 0;
>> +}
>> +
>> +static int kvm_init_mt(struct kvm *kvm, struct mt_setup *mts)
>> +{
>> +	int rc = 0;
>> +
>> +	if (mts->version != KVM_MT_VERSION) {
>> +		pr_warn(
>> +			"KVM: vm %d, MT version error, kvm %d, qemu %d\n",
>> +			current->pid, KVM_MT_VERSION, mts->version);
>> +		rc = -EINVAL;
>> +		goto init_mt_done;
>> +	}
>> +
>> +	if (mts->max_dirty == 0) {
>> +		pr_warn("KVM: vm %d, MT max_dirty is zero?\n",
>> +			current->pid);
>> +		rc = -EINVAL;
>> +		goto init_mt_done;
>> +	}
>> +
>> +	kvm->mt.max_gfn = kvm_get_max_gfn(kvm);
>> +
>> +	rc = __kvm_init_mt(kvm, mts);
>> +
>> +init_mt_done:
>> +	return rc;
>> +}
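As an aside for v2: mt_calc_max_dirty() is an open-coded round-up, so it
could equally be written with the kernel's roundup() macro (identical
behavior for goal > 0, which kvm_init_mt() already guarantees by
rejecting max_dirty == 0):

static int mt_calc_max_dirty(int goal)
{
	/* smallest multiple of DIRTY_GFN_ADD_GRANULARITY that is >= goal */
	return roundup(goal, DIRTY_GFN_ADD_GRANULARITY);
}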
>> +
>> +static int kvm_cleanup_mt(struct kvm *kvm, int force)
>> +{
>> +	int rc = 0;
>> +	int i;
>> +	struct kvm_vcpu *vcpu;
>> +
>> +	if (!force && kvm->mt.active) {
>> +		pr_warn("KVM: vm %d, MT still active!\n",
>> +			current->pid);
>> +		rc = -EBUSY;
>> +		goto cleanup_mt_done;
>> +	}
>> +
>> +	if (kvm->mt.gfn_list.dirty_gfns)
>> +		vfree(kvm->mt.gfn_list.dirty_gfns);
>> +
>> +	clear_kvm_mt(kvm);
>> +
>> +	kvm_for_each_vcpu(i, vcpu, kvm) {
>> +		int id = vcpu->vcpu_id;
>> +		struct gfn_list *gfnlist = &vcpu->kvm->vcpu_mt[id].gfn_list;
>> +
>> +		if (gfnlist->dirty_gfns)
>> +			vfree(gfnlist->dirty_gfns);
>> +		clear_vcpu_mt(vcpu);
>> +	}
>> +
>> +	if (kvm->mt.bmap)
>> +		vfree(kvm->mt.bmap);
>> +	kvm->mt.bmap = NULL;
>> +	if (kvm->mt.gfn_buf)
>> +		vfree(kvm->mt.gfn_buf);
>> +	kvm->mt.gfn_buf = NULL;
>> +
>> +	kvm->mt.active = 0;
>> +
>> +cleanup_mt_done:
>> +
>> +	return rc;
>> +}
>> +
>>  static int kvm_vm_ioctl_mt_init(struct kvm *kvm, struct mt_setup *mts)
>>  {
>> -	return -EINVAL;
>> +	if (mts->op == KVM_MT_OP_INIT)
>> +		return kvm_init_mt(kvm, mts);
>> +	else if (mts->op == KVM_MT_OP_CLEANUP)
>> +		return kvm_cleanup_mt(kvm, 0);
>> +	else
>> +		return -EINVAL;
>>  }
>>
>>  static int kvm_vm_ioctl_mt_enable(struct kvm *kvm, struct mt_enable *mte)
>>
>
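For completeness, userspace is expected to drive these ioctls roughly as
follows (sketch only: the KVM_MT_SETUP ioctl name is an assumption based
on the rest of this series, and error handling is omitted):

struct mt_setup mts = {
	.op        = KVM_MT_OP_INIT,
	.version   = KVM_MT_VERSION,
	.max_dirty = 16384,	/* rounded up to a multiple of 256 in-kernel */
	.flags     = KVM_MT_OPTION_NO_DUPS,	/* de-dup gfns via the bitmap */
};

ioctl(vm_fd, KVM_MT_SETUP, &mts);	/* reaches kvm_vm_ioctl_mt_init() */

/* ... checkpoint epochs run here ... */

mts.op = KVM_MT_OP_CLEANUP;
ioctl(vm_fd, KVM_MT_SETUP, &mts);	/* -EBUSY while tracking is active */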