Introduce memory tracking data structures and implement the new
setup/cleanup ioctls.

Chief among the KVM performance bottlenecks is the use of large
(VM-sized) bitmaps to convey the information about which pages of
memory are dirtied. These bitmaps are copied to user-space when QEMU
queries KVM for its dirty page information. The use of bitmaps makes
sense in the current live-migration method, as it is possible for all
of memory to be dirty from one log-dirty pass to another. But in a
checkpoint system the number of dirty pages is bounded: the VM is
paused once it has dirtied a pre-defined number of pages. Traversing a
large, sparsely populated bitmap to find set bits is time-consuming,
as is copying the bitmap to user-space. The preferred data structure
for performant checkpoint implementations is a dense list of guest
frame numbers (GFNs), which also plays well with PML (page
modification logging).

Signed-off-by: Lei Cao <lei.cao@xxxxxxxxxxx>
---
 include/linux/kvm_host.h |  53 ++++++++
 virt/kvm/kvm_main.c      | 255 ++++++++++++++++++++++++++++++++++++-
 2 files changed, 307 insertions(+), 1 deletion(-)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 5276fe0..5793ecf 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -358,6 +358,54 @@ struct kvm_memslots {
 	int used_slots;
 };
 
+struct gfn_list {
+	int max_dirty;
+	__u64 *dirty_gfns;	/* slot / offset */
+	int dirty_index;
+	int fetch_index;
+	int reset_index;
+	int overflow;
+	spinlock_t lock;
+	struct mutex mtx;
+};
+
+struct vcpu_mt {
+	struct gfn_list gfn_list;
+};
+
+struct sublist_waiter {
+	wait_queue_head_t wq;
+	spinlock_t lock;
+	int goal;
+};
+
+#define MAX_DIRTY_PAGE_BATCH (4096/sizeof(int))
+
+struct kvm_mt {
+	long cp_id;
+	int active;
+	int mode;
+	spinlock_t lock;
+	u64 max_gfn;
+	int quiesced;
+	int allow_blocking;
+	void *bmap;
+	long bmapsz;
+	int dirty_trigger;
+
+	struct gfn_list gfn_list;
+
+	int tot_pages;
+	int fetch_count;
+
+	struct mutex sublist_mtx;
+	struct sublist_waiter sw;
+	int sw_busy;
+	spinlock_t sw_lock;
+
+	__u64 *gfn_buf;
+};
+
 struct kvm {
 	spinlock_t mmu_lock;
 	struct mutex slots_lock;
@@ -407,6 +455,11 @@ struct kvm {
 #endif
 	long tlbs_dirty;
 	struct list_head devices;
+
+
+	struct kvm_mt mt;
+
+	struct vcpu_mt vcpu_mt[KVM_MAX_VCPUS];
 };
 
 #define kvm_err(fmt, ...) \
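For orientation, the gfn_list fields suggest a simple ring over
dirty_gfns: dirty_index advances as vCPUs log newly dirtied GFNs,
fetch_index presumably trails it as user-space drains entries, and
reset_index trails fetch_index as the fetched pages are
write-protected again; overflow records that the list filled before
the VM could be paused. A minimal sketch of the producer side under
those assumptions (mt_gfn_list_add is illustrative only, not part of
this patch):

/* Illustrative producer: append one slot/offset-encoded GFN. */
static int mt_gfn_list_add(struct gfn_list *gfnlist, __u64 slot_offset)
{
	int ret = 0;

	spin_lock(&gfnlist->lock);
	if (gfnlist->dirty_index >= gfnlist->max_dirty) {
		/* list is full; the VM should already be pausing */
		gfnlist->overflow = 1;
		ret = -ENOSPC;
	} else {
		gfnlist->dirty_gfns[gfnlist->dirty_index++] = slot_offset;
	}
	spin_unlock(&gfnlist->lock);

	return ret;
}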
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 8a582e5..fe46067 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -220,10 +220,48 @@ void kvm_reload_remote_mmus(struct kvm *kvm)
 	kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD);
 }
 
+static void init_gfn_list(struct gfn_list *gfnlist)
+{
+	gfnlist->dirty_gfns = NULL;
+	gfnlist->dirty_index = 0;
+	gfnlist->fetch_index = 0;
+	gfnlist->reset_index = 0;
+	gfnlist->overflow = 0;
+}
+
+static void clear_gfn_list(struct gfn_list *gfnlist)
+{
+	init_gfn_list(gfnlist);
+	gfnlist->max_dirty = 0;
+}
+
+static void clear_kvm_mt(struct kvm *kvm)
+{
+	clear_gfn_list(&kvm->mt.gfn_list);
+	kvm->mt.active = 0;
+	kvm->mt.mode = 0;
+	kvm->mt.tot_pages = kvm->mt.fetch_count = 0;
+}
+
+static void clear_vcpu_mt(struct kvm_vcpu *vcpu)
+{
+	struct gfn_list *gfnlist = &vcpu->kvm->vcpu_mt[vcpu->vcpu_id].gfn_list;
+
+	clear_gfn_list(gfnlist);
+}
+
+static void init_vcpu_mt(struct kvm_vcpu *vcpu)
+{
+	struct gfn_list *gfnlist = &vcpu->kvm->vcpu_mt[vcpu->vcpu_id].gfn_list;
+
+	init_gfn_list(gfnlist);
+}
+
 int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
 {
 	struct page *page;
 	int r;
+	struct gfn_list *gfnlist;
 
 	mutex_init(&vcpu->mutex);
 	vcpu->cpu = -1;
@@ -236,6 +274,11 @@ int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
 	vcpu->pre_pcpu = -1;
 	INIT_LIST_HEAD(&vcpu->blocked_vcpu_list);
 
+	gfnlist = &vcpu->kvm->vcpu_mt[vcpu->vcpu_id].gfn_list;
+	clear_vcpu_mt(vcpu);
+	spin_lock_init(&gfnlist->lock);
+	mutex_init(&gfnlist->mtx);
+
 	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
 	if (!page) {
 		r = -ENOMEM;
@@ -602,6 +645,16 @@ static struct kvm *kvm_create_vm(unsigned long type)
 
 	preempt_notifier_inc();
 
+	clear_kvm_mt(kvm);
+	spin_lock_init(&kvm->mt.lock);
+	spin_lock_init(&kvm->mt.sw_lock);
+	spin_lock_init(&kvm->mt.gfn_list.lock);
+	mutex_init(&kvm->mt.gfn_list.mtx);
+	init_waitqueue_head(&kvm->mt.sw.wq);
+	spin_lock_init(&kvm->mt.sw.lock);
+	kvm->mt.sw.goal = 0;
+	mutex_init(&kvm->mt.sublist_mtx);
+
 	return kvm;
 
 out_err:
@@ -642,11 +695,14 @@ static void kvm_destroy_devices(struct kvm *kvm)
 	}
 }
 
+static int kvm_cleanup_mt(struct kvm *kvm, int force);
+
 static void kvm_destroy_vm(struct kvm *kvm)
 {
 	int i;
 	struct mm_struct *mm = kvm->mm;
 
+	kvm_cleanup_mt(kvm, 1);
 	kvm_arch_sync_events(kvm);
 	spin_lock(&kvm_lock);
 	list_del(&kvm->vm_list);
@@ -2752,9 +2808,206 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
 	return kvm_vm_ioctl_check_extension(kvm, arg);
 }
 
+static u64 kvm_get_max_gfn(struct kvm *kvm)
+{
+	int num_gfn = -1;
+	struct kvm_memslots *slots = kvm_memslots(kvm);
+	struct kvm_memory_slot *memslot;
+	int gfn;
+
+	kvm_for_each_memslot(memslot, slots) {
+		gfn = memslot->base_gfn + memslot->npages;
+		if (gfn > num_gfn)
+			num_gfn = gfn;
+	}
+	return num_gfn - 1;
+}
+
+#define DIRTY_GFN_ADD_GRANULARITY (256)
+
+/*
+ * Return the smallest multiple of DIRTY_GFN_ADD_GRANULARITY that is
+ * >= goal.
+ */
+static int mt_calc_max_dirty(int goal)
+{
+	int val = DIRTY_GFN_ADD_GRANULARITY;
+
+	while (val < goal)
+		val += DIRTY_GFN_ADD_GRANULARITY;
+	return val;
+}
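As a worked example, mt_calc_max_dirty(1000) steps through 256, 512,
768, 1024 and returns 1024. Because DIRTY_GFN_ADD_GRANULARITY is a
compile-time constant, the same result is available in constant time
via the kernel's roundup() and max() macros; mt_calc_max_dirty_fast
below is a hypothetical equivalent, not part of this patch:

static int mt_calc_max_dirty_fast(int goal)
{
	/* roundup(0, n) is 0, so clamp goal to at least 1 first */
	return roundup(max(goal, 1), DIRTY_GFN_ADD_GRANULARITY);
}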
+static int __kvm_init_mt(struct kvm *kvm, struct mt_setup *mts)
+{
+	struct kvm_vcpu *vcpu;
+	int i, alloc_fail = 0;
+	struct gfn_list *gfnlist = &kvm->mt.gfn_list;
+	int max_dirty;
+
+	if (gfnlist->dirty_gfns) {
+		pr_warn("KVM: vm %d, MT already initialized\n",
+			current->pid);
+		return -EBUSY;
+	}
+
+	/*
+	 * Internally, max_dirty must be a multiple of
+	 * DIRTY_GFN_ADD_GRANULARITY.
+	 */
+	max_dirty = mt_calc_max_dirty(mts->max_dirty);
+
+	/* probably need to make gfn lists for each vcpu to avoid locking */
+	gfnlist->dirty_gfns = vmalloc(max_dirty *
+				      sizeof(*(gfnlist->dirty_gfns)));
+	if (!gfnlist->dirty_gfns)
+		return -ENOMEM;
+
+	if (mts->flags & KVM_MT_OPTION_NO_DUPS) {
+		long bmapsz = sizeof(long) *
+			((kvm->mt.max_gfn + BITS_PER_LONG)/BITS_PER_LONG);
+
+		kvm->mt.bmap = vmalloc(bmapsz);
+		if (!kvm->mt.bmap) {
+			vfree(gfnlist->dirty_gfns);
+			gfnlist->dirty_gfns = NULL;
+			return -ENOMEM;
+		}
+		kvm->mt.bmapsz = bmapsz;
+		memset(kvm->mt.bmap, 0, bmapsz);
+		pr_info("KVM: vm %d, using bmap, size %ld\n",
+			current->pid, bmapsz);
+	}
+
+	gfnlist->max_dirty = max_dirty;
+	gfnlist->dirty_index = 0;
+
+	kvm->mt.active = 0;
+	kvm->mt.tot_pages = 0;
+	kvm->mt.fetch_count = 0;
+	kvm->mt.quiesced = 0;
+	kvm->mt.allow_blocking = 1;
+	kvm->mt.sw_busy = 0;
+	kvm->mt.gfn_buf = vmalloc(max_dirty * sizeof(*(kvm->mt.gfn_buf)));
+	if (!kvm->mt.gfn_buf) {
+		vfree(gfnlist->dirty_gfns);
+		gfnlist->dirty_gfns = NULL;
+		if (kvm->mt.bmap) {
+			vfree(kvm->mt.bmap);
+			kvm->mt.bmap = NULL;
+		}
+		return -ENOMEM;
+	}
+
+	kvm_for_each_vcpu(i, vcpu, kvm) {
+		init_vcpu_mt(vcpu);
+		gfnlist = &vcpu->kvm->vcpu_mt[vcpu->vcpu_id].gfn_list;
+		gfnlist->dirty_gfns = vmalloc(max_dirty *
+					      sizeof(*(gfnlist->dirty_gfns)));
+		if (!gfnlist->dirty_gfns) {
+			pr_warn("KVM: vm %d, MT init, alloc fail, vcpu %d\n",
+				current->pid, i);
+			alloc_fail = 1;
+			break;
+		}
+		gfnlist->max_dirty = max_dirty;
+	}
+
+	if (alloc_fail) {
+		/* cleanup and bail out */
+		kvm_for_each_vcpu(i, vcpu, kvm) {
+			gfnlist = &vcpu->kvm->vcpu_mt[vcpu->vcpu_id].gfn_list;
+			if (gfnlist->dirty_gfns)
+				vfree(gfnlist->dirty_gfns);
+			clear_vcpu_mt(vcpu);
+		}
+
+		vfree(kvm->mt.gfn_list.dirty_gfns);
+		kvm->mt.gfn_list.dirty_gfns = NULL;
+		if (kvm->mt.bmap)
+			vfree(kvm->mt.bmap);
+		kvm->mt.bmap = NULL;
+		vfree(kvm->mt.gfn_buf);
+		kvm->mt.gfn_buf = NULL;
+		clear_kvm_mt(kvm);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static int kvm_init_mt(struct kvm *kvm, struct mt_setup *mts)
+{
+	int rc = 0;
+
+	if (mts->version != KVM_MT_VERSION) {
+		pr_warn("KVM: vm %d, MT version error, kvm %d, qemu %d\n",
+			current->pid, KVM_MT_VERSION, mts->version);
+		rc = -EINVAL;
+		goto init_mt_done;
+	}
+
+	if (mts->max_dirty == 0) {
+		pr_warn("KVM: vm %d, MT max_dirty is zero?\n",
+			current->pid);
+		rc = -EINVAL;
+		goto init_mt_done;
+	}
+
+	kvm->mt.max_gfn = kvm_get_max_gfn(kvm);
+
+	rc = __kvm_init_mt(kvm, mts);
+
+init_mt_done:
+	return rc;
+}
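For context, this is roughly how user-space would be expected to drive
the two ops. KVM_MT_SETUP is a placeholder for whatever ioctl number
the uapi header assigns to kvm_vm_ioctl_mt_init(), and the struct
mt_setup initializers are inferred from the fields this patch reads:

/* Hypothetical user-space sketch; KVM_MT_SETUP is a placeholder name. */
static int mt_setup(int vm_fd, unsigned int max_dirty)
{
	struct mt_setup mts = {
		.op = KVM_MT_OP_INIT,
		.version = KVM_MT_VERSION,
		.max_dirty = max_dirty,	/* rounded up in-kernel */
		.flags = KVM_MT_OPTION_NO_DUPS,	/* optional GFN dedup bitmap */
	};

	return ioctl(vm_fd, KVM_MT_SETUP, &mts);
}

static int mt_teardown(int vm_fd)
{
	struct mt_setup mts = { .op = KVM_MT_OP_CLEANUP };

	return ioctl(vm_fd, KVM_MT_SETUP, &mts);
}

Note that cleanup requested via the ioctl (force == 0) fails with
-EBUSY while tracking is still active (see kvm_cleanup_mt() below), so
user-space must disable tracking first; only VM destruction passes
force == 1.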
+static int kvm_cleanup_mt(struct kvm *kvm, int force)
+{
+	int rc = 0;
+	int i;
+	struct kvm_vcpu *vcpu;
+
+	if (!force && kvm->mt.active) {
+		pr_warn("KVM: vm %d, MT still active!\n",
+			current->pid);
+		rc = -EBUSY;
+		goto cleanup_mt_done;
+	}
+
+	if (kvm->mt.gfn_list.dirty_gfns)
+		vfree(kvm->mt.gfn_list.dirty_gfns);
+
+	clear_kvm_mt(kvm);
+
+	kvm_for_each_vcpu(i, vcpu, kvm) {
+		int id = vcpu->vcpu_id;
+		struct gfn_list *gfnlist = &vcpu->kvm->vcpu_mt[id].gfn_list;
+
+		if (gfnlist->dirty_gfns)
+			vfree(gfnlist->dirty_gfns);
+		clear_vcpu_mt(vcpu);
+	}
+
+	if (kvm->mt.bmap)
+		vfree(kvm->mt.bmap);
+	kvm->mt.bmap = NULL;
+	if (kvm->mt.gfn_buf)
+		vfree(kvm->mt.gfn_buf);
+	kvm->mt.gfn_buf = NULL;
+
+	kvm->mt.active = 0;
+
+cleanup_mt_done:
+	return rc;
+}
+
 static int kvm_vm_ioctl_mt_init(struct kvm *kvm, struct mt_setup *mts)
 {
-	return -EINVAL;
+	if (mts->op == KVM_MT_OP_INIT)
+		return kvm_init_mt(kvm, mts);
+	else if (mts->op == KVM_MT_OP_CLEANUP)
+		return kvm_cleanup_mt(kvm, 0);
+	else
+		return -EINVAL;
 }
 
 static int kvm_vm_ioctl_mt_enable(struct kvm *kvm, struct mt_enable *mte)
-- 
2.5.0