On 4/28/2016 5:05 AM, Huang, Kai wrote:
> Hi,
>
> On 4/27/2016 7:23 AM, Cao, Lei wrote:
>> Introduce memory tracking data structures and implement the new
>> setup/cleanup ioctls.
>>
>> Chief among the KVM performance bottlenecks is the use of large
>> (VM-sized) bitmaps to convey the information about which pages of memory
>> are dirtied. These bitmaps are copied to user-space when QEMU queries
>> KVM for its dirty page information. The use of bitmaps makes sense in the
>> current live-migration method, as it is possible for all of memory to be
>> dirty from one log-dirty pass to another. But in a checkpoint system, the
>> number of dirty pages is bounded such that the VM is paused when it has
>> dirtied a pre-defined number of pages. Traversing a large, sparsely
>> populated bitmap to find set bits is time-consuming, as is copying the
>> bitmap to user-space.
>>
>> The preferred data structure for performant checkpoint implementations is
>> a dense list of guest frame numbers (GFN), which also plays well with PML.
>>
>> Signed-off-by: Lei Cao <lei.cao@xxxxxxxxxxx>
>> ---
>>  include/linux/kvm_host.h |  53 ++++++++
>>  virt/kvm/kvm_main.c      | 255 ++++++++++++++++++++++++++++++++++++-
>>  2 files changed, 307 insertions(+), 1 deletion(-)
>>
>> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
>> index 5276fe0..5793ecf 100644
>> --- a/include/linux/kvm_host.h
>> +++ b/include/linux/kvm_host.h
>> @@ -358,6 +358,54 @@ struct kvm_memslots {
>>  	int used_slots;
>>  };
>>
>> +struct gfn_list {
>> +	int max_dirty;
>> +	__u64 *dirty_gfns;	/* slot / offset */
>> +	int dirty_index;
>> +	int fetch_index;
>> +	int reset_index;
>> +	int overflow;
>> +	spinlock_t lock;
>> +	struct mutex mtx;
>> +};
>> +
>> +struct vcpu_mt {
>> +	struct gfn_list gfn_list;
>> +};
>> +
>> +struct sublist_waiter {
>> +	wait_queue_head_t wq;
>> +	spinlock_t lock;
>> +	int goal;
>> +};
>> +
>> +#define MAX_DIRTY_PAGE_BATCH	(4096/sizeof(int))
>
> 4096 -> PAGE_SIZE ?

Right. This define is no longer used, will remove.

>
>> +
>> +struct kvm_mt {
>> +	long cp_id;
>> +	int active;
>> +	int mode;
>> +	spinlock_t lock;
>> +	u64 max_gfn;
>> +	int quiesced;
>> +	int allow_blocking;
>> +	void *bmap;
>> +	long bmapsz;
>> +	int dirty_trigger;
>> +
>> +	struct gfn_list gfn_list;
>> +
>> +	int tot_pages;
>> +	int fetch_count;
>> +
>> +	struct mutex sublist_mtx;
>> +	struct sublist_waiter sw;
>> +	int sw_busy;
>> +	spinlock_t sw_lock;
>> +
>> +	__u64 *gfn_buf;
>> +};
>> +
>>  struct kvm {
>>  	spinlock_t mmu_lock;
>>  	struct mutex slots_lock;
>> @@ -407,6 +455,11 @@ struct kvm {
>>  #endif
>>  	long tlbs_dirty;
>>  	struct list_head devices;
>> +
>> +
>> +	struct kvm_mt mt;
>> +
>> +	struct vcpu_mt vcpu_mt[KVM_MAX_VCPUS];
>
> Why not make it embedded in kvm_vcpu?

Good idea. Will do.

>
>>  };
>>
>>  #define kvm_err(fmt, ...) \
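For reference, here is roughly what the embed-in-kvm_vcpu rework looks
like (untested sketch; the accessor name below is mine, not part of this
patch):

struct kvm_vcpu {
	...
	struct gfn_list gfn_list;	/* replaces kvm->vcpu_mt[vcpu_id] */
	...
};

static inline struct gfn_list *vcpu_gfn_list(struct kvm_vcpu *vcpu)
{
	return &vcpu->gfn_list;
}

That also removes the KVM_MAX_VCPUS-sized array from struct kvm and the
vcpu_id indexing on every access.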
>> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
>> index 8a582e5..fe46067 100644
>> --- a/virt/kvm/kvm_main.c
>> +++ b/virt/kvm/kvm_main.c
>> @@ -220,10 +220,48 @@ void kvm_reload_remote_mmus(struct kvm *kvm)
>>  	kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD);
>>  }
>>
>> +static void init_gfn_list(struct gfn_list *gfnlist)
>> +{
>> +	gfnlist->dirty_gfns = NULL;
>> +	gfnlist->dirty_index = 0;
>> +	gfnlist->fetch_index = 0;
>> +	gfnlist->reset_index = 0;
>> +	gfnlist->overflow = 0;
>> +}
>> +
>> +static void clear_gfn_list(struct gfn_list *gfnlist)
>> +{
>> +	init_gfn_list(gfnlist);
>> +	gfnlist->max_dirty = 0;
>> +}
>> +
>> +static void clear_kvm_mt(struct kvm *kvm)
>> +{
>> +	clear_gfn_list(&kvm->mt.gfn_list);
>> +	kvm->mt.active = 0;
>> +	kvm->mt.mode = 0;
>> +	kvm->mt.tot_pages = kvm->mt.fetch_count = 0;
>> +}
>> +
>> +static void clear_vcpu_mt(struct kvm_vcpu *vcpu)
>> +{
>> +	struct gfn_list *gfnlist = &vcpu->kvm->vcpu_mt[vcpu->vcpu_id].gfn_list;
>> +
>> +	clear_gfn_list(gfnlist);
>> +}
>> +
>> +static void init_vcpu_mt(struct kvm_vcpu *vcpu)
>> +{
>> +	struct gfn_list *gfnlist = &vcpu->kvm->vcpu_mt[vcpu->vcpu_id].gfn_list;
>> +
>> +	init_gfn_list(gfnlist);
>> +}
>> +
>>  int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
>>  {
>>  	struct page *page;
>>  	int r;
>> +	struct gfn_list *gfnlist;
>>
>>  	mutex_init(&vcpu->mutex);
>>  	vcpu->cpu = -1;
>> @@ -236,6 +274,11 @@ int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
>>  	vcpu->pre_pcpu = -1;
>>  	INIT_LIST_HEAD(&vcpu->blocked_vcpu_list);
>>
>> +	gfnlist = &vcpu->kvm->vcpu_mt[vcpu->vcpu_id].gfn_list;
>> +	clear_vcpu_mt(vcpu);
>> +	spin_lock_init(&gfnlist->lock);
>> +	mutex_init(&gfnlist->mtx);
>> +
>>  	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
>>  	if (!page) {
>>  		r = -ENOMEM;
>> @@ -602,6 +645,16 @@ static struct kvm *kvm_create_vm(unsigned long type)
>>
>>  	preempt_notifier_inc();
>>
>> +	clear_kvm_mt(kvm);
>> +	spin_lock_init(&kvm->mt.lock);
>> +	spin_lock_init(&kvm->mt.sw_lock);
>> +	spin_lock_init(&kvm->mt.gfn_list.lock);
>> +	mutex_init(&kvm->mt.gfn_list.mtx);
>> +	init_waitqueue_head(&kvm->mt.sw.wq);
>> +	spin_lock_init(&kvm->mt.sw.lock);
>> +	kvm->mt.sw.goal = 0;
>> +	mutex_init(&kvm->mt.sublist_mtx);
>> +
>>  	return kvm;
>>
>>  out_err:
>> @@ -642,11 +695,14 @@ static void kvm_destroy_devices(struct kvm *kvm)
>>  	}
>>  }
>>
>> +static int kvm_cleanup_mt(struct kvm *kvm, int force);
>> +
>>  static void kvm_destroy_vm(struct kvm *kvm)
>>  {
>>  	int i;
>>  	struct mm_struct *mm = kvm->mm;
>>
>> +	kvm_cleanup_mt(kvm, 1);
>>  	kvm_arch_sync_events(kvm);
>>  	spin_lock(&kvm_lock);
>>  	list_del(&kvm->vm_list);
>> @@ -2752,9 +2808,206 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
>>  	return kvm_vm_ioctl_check_extension(kvm, arg);
>>  }
>>
>> +static u64 kvm_get_max_gfn(struct kvm *kvm)
>> +{
>> +	int num_gfn = -1;
>> +	struct kvm_memslots *slots = kvm_memslots(kvm);
>> +	struct kvm_memory_slot *memslot;
>> +	int gfn;
>> +
>> +	kvm_for_each_memslot(memslot, slots) {
>> +		gfn = memslot->base_gfn + memslot->npages;
>> +		if (gfn > num_gfn)
>> +			num_gfn = gfn;
>> +	}
>> +	return num_gfn - 1;
>> +}
>
> This might be wrong if there are devices assigned to the guest, as each
> assigned resource also occupies a memory slot. I assume you don't want
> to monitor assigned resources, do you?

Max gfn is only used for a sanity check: when we are told to mark a page
dirty, or to reset the write trap for a page, we make sure that the gfn
is valid.
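Concretely, the guard at the top of the mark-dirty and write-trap-reset
paths looks roughly like this (sketch only; the helper name is made up
for illustration and is not part of this patch):

static bool mt_gfn_in_range(struct kvm *kvm, u64 gfn)
{
	/* reject gfns beyond the highest memslot seen at MT-init time */
	return gfn <= kvm->mt.max_gfn;
}

A stale or bogus gfn can then be refused before it ever touches the
dirty list.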
>
>> +
>> +#define DIRTY_GFN_ADD_GRANULARITY	(256)
>> +
>> +/*
>> + * Return the smallest multiple of DIRTY_GFN_ADD_GRANULARITY that is >= goal.
>> + */
>> +static int mt_calc_max_dirty(int goal)
>> +{
>> +	int val = DIRTY_GFN_ADD_GRANULARITY;
>> +
>> +	while (val < goal)
>> +		val += DIRTY_GFN_ADD_GRANULARITY;
>> +	return val;
>> +}
>> +
>> +static int __kvm_init_mt(struct kvm *kvm, struct mt_setup *mts)
>> +{
>> +	struct kvm_vcpu *vcpu;
>> +	int i, alloc_fail = 0;
>> +	struct gfn_list *gfnlist = &kvm->mt.gfn_list;
>> +	int max_dirty;
>> +
>> +	if (gfnlist->dirty_gfns) {
>> +		pr_warn("KVM: vm %d, MT already initialized\n",
>> +			current->pid);
>> +		return -EBUSY;
>> +	}
>> +
>> +	/*
>> +	 * Internally, max_dirty must be a multiple of DIRTY_GFN_ADD_GRANULARITY
>> +	 */
>> +	max_dirty = mt_calc_max_dirty(mts->max_dirty);
>> +
>> +	/* probably need to make gfn lists for each vcpu to avoid locking */
>> +	gfnlist->dirty_gfns = vmalloc(max_dirty *
>> +				      sizeof(*(gfnlist->dirty_gfns)));
>> +	if (!gfnlist->dirty_gfns)
>> +		return -ENOMEM;
>> +
>> +	if (mts->flags & KVM_MT_OPTION_NO_DUPS) {
>> +		long bmapsz = sizeof(long) *
>> +			((kvm->mt.max_gfn + BITS_PER_LONG)/BITS_PER_LONG);
>> +		kvm->mt.bmap = vmalloc(bmapsz);
>> +		if (!kvm->mt.bmap) {
>> +			vfree(gfnlist->dirty_gfns);
>> +			gfnlist->dirty_gfns = NULL;
>> +			return -ENOMEM;
>> +		}
>> +		kvm->mt.bmapsz = bmapsz;
>> +		memset(kvm->mt.bmap, 0, bmapsz);
>> +		pr_info("KVM: vm %d, using bmap, size %ld\n",
>> +			current->pid, bmapsz);
>> +	}
>> +
>> +	gfnlist->max_dirty = max_dirty;
>> +	gfnlist->dirty_index = 0;
>> +
>> +	kvm->mt.active = 0;
>> +	kvm->mt.tot_pages = 0;
>> +	kvm->mt.fetch_count = 0;
>> +	kvm->mt.quiesced = 0;
>> +	kvm->mt.allow_blocking = 1;
>> +	kvm->mt.sw_busy = 0;
>> +	kvm->mt.gfn_buf = vmalloc(max_dirty * sizeof(*(kvm->mt.gfn_buf)));
>> +	if (!kvm->mt.gfn_buf) {
>> +		vfree(gfnlist->dirty_gfns);
>> +		gfnlist->dirty_gfns = NULL;
>> +		if (kvm->mt.bmap) {
>> +			vfree(kvm->mt.bmap);
>> +			kvm->mt.bmap = NULL;
>> +		}
>> +		return -ENOMEM;
>> +	}
>> +
>> +	kvm_for_each_vcpu(i, vcpu, kvm) {
>> +		init_vcpu_mt(vcpu);
>> +		gfnlist = &vcpu->kvm->vcpu_mt[vcpu->vcpu_id].gfn_list;
>> +		gfnlist->dirty_gfns =
>> +			vmalloc(max_dirty *
>> +				sizeof(*(gfnlist->dirty_gfns)));
>> +		if (!gfnlist->dirty_gfns) {
>> +			pr_warn("KVM: vm %d, MT init, alloc fail, vcpu %d\n",
>> +				current->pid, i);
>> +			alloc_fail = 1;
>> +			break;
>> +		}
>> +		gfnlist->max_dirty = max_dirty;
>> +	}
>> +
>> +	if (alloc_fail) {
>> +		/* cleanup and bail out */
>> +		kvm_for_each_vcpu(i, vcpu, kvm) {
>> +			gfnlist = &vcpu->kvm->vcpu_mt[vcpu->vcpu_id].gfn_list;
>> +			if (gfnlist->dirty_gfns)
>> +				vfree(gfnlist->dirty_gfns);
>> +			clear_vcpu_mt(vcpu);
>> +		}
>> +
>> +		vfree(kvm->mt.gfn_list.dirty_gfns);
>> +		kvm->mt.gfn_list.dirty_gfns = NULL;
>> +		vfree(kvm->mt.bmap);
>> +		kvm->mt.bmap = NULL;
>> +		vfree(kvm->mt.gfn_buf);
>> +		kvm->mt.gfn_buf = NULL;
>> +		clear_kvm_mt(kvm);
>> +		return -ENOMEM;
>> +	}
>> +
>> +	return 0;
>> +}
>> +
>> +static int kvm_init_mt(struct kvm *kvm, struct mt_setup *mts)
>> +{
>> +	int rc = 0;
>> +
>> +	if (mts->version != KVM_MT_VERSION) {
>> +		pr_warn(
>> +			"KVM: vm %d, MT version error, kvm %d, qemu %d\n",
>> +			current->pid, KVM_MT_VERSION, mts->version);
>> +		rc = -EINVAL;
>> +		goto init_mt_done;
>> +	}
>> +
>> +	if (mts->max_dirty == 0) {
>> +		pr_warn("KVM: vm %d, MT max_dirty is zero?\n",
>> +			current->pid);
>> +		rc = -EINVAL;
>> +		goto init_mt_done;
>> +	}
>> +
>> +	kvm->mt.max_gfn = kvm_get_max_gfn(kvm);
>> +
>> +	rc = __kvm_init_mt(kvm, mts);
>> +
>> +init_mt_done:
>> +	return rc;
>> +}
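As an aside for v2: mt_calc_max_dirty() is an open-coded round-up, so it
could equally be written with the kernel's roundup() macro (identical
behavior for goal > 0, which kvm_init_mt() already guarantees by
rejecting max_dirty == 0):

static int mt_calc_max_dirty(int goal)
{
	/* smallest multiple of DIRTY_GFN_ADD_GRANULARITY that is >= goal */
	return roundup(goal, DIRTY_GFN_ADD_GRANULARITY);
}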
>> +
>> +static int kvm_cleanup_mt(struct kvm *kvm, int force)
>> +{
>> +	int rc = 0;
>> +	int i;
>> +	struct kvm_vcpu *vcpu;
>> +
>> +	if (!force && kvm->mt.active) {
>> +		pr_warn("KVM: vm %d, MT still active!\n",
>> +			current->pid);
>> +		rc = -EBUSY;
>> +		goto cleanup_mt_done;
>> +	}
>> +
>> +	if (kvm->mt.gfn_list.dirty_gfns)
>> +		vfree(kvm->mt.gfn_list.dirty_gfns);
>> +
>> +	clear_kvm_mt(kvm);
>> +
>> +	kvm_for_each_vcpu(i, vcpu, kvm) {
>> +		int id = vcpu->vcpu_id;
>> +		struct gfn_list *gfnlist = &vcpu->kvm->vcpu_mt[id].gfn_list;
>> +
>> +		if (gfnlist->dirty_gfns)
>> +			vfree(gfnlist->dirty_gfns);
>> +		clear_vcpu_mt(vcpu);
>> +	}
>> +
>> +	if (kvm->mt.bmap)
>> +		vfree(kvm->mt.bmap);
>> +	kvm->mt.bmap = NULL;
>> +	if (kvm->mt.gfn_buf)
>> +		vfree(kvm->mt.gfn_buf);
>> +	kvm->mt.gfn_buf = NULL;
>> +
>> +	kvm->mt.active = 0;
>> +
>> +cleanup_mt_done:
>> +
>> +	return rc;
>> +}
>> +
>>  static int kvm_vm_ioctl_mt_init(struct kvm *kvm, struct mt_setup *mts)
>>  {
>> -	return -EINVAL;
>> +	if (mts->op == KVM_MT_OP_INIT)
>> +		return kvm_init_mt(kvm, mts);
>> +	else if (mts->op == KVM_MT_OP_CLEANUP)
>> +		return kvm_cleanup_mt(kvm, 0);
>> +	else
>> +		return -EINVAL;
>>  }
>>
>>  static int kvm_vm_ioctl_mt_enable(struct kvm *kvm, struct mt_enable *mte)
>>
>
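For completeness, userspace is expected to drive these ioctls roughly as
follows (sketch only: the KVM_MT_SETUP ioctl name is an assumption based
on the rest of this series, and error handling is omitted):

struct mt_setup mts = {
	.op        = KVM_MT_OP_INIT,
	.version   = KVM_MT_VERSION,
	.max_dirty = 16384,	/* rounded up to a multiple of 256 in-kernel */
	.flags     = KVM_MT_OPTION_NO_DUPS,	/* de-dup gfns via the bitmap */
};

ioctl(vm_fd, KVM_MT_SETUP, &mts);	/* reaches kvm_vm_ioctl_mt_init() */

/* ... checkpoint epochs run here ... */

mts.op = KVM_MT_OP_CLEANUP;
ioctl(vm_fd, KVM_MT_SETUP, &mts);	/* -EBUSY while tracking is active */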