Implement the remaining memory tracking API: enabling and disabling
tracking, preparing a checkpoint, fetching sublists of dirty pages
(optionally blocking until enough have accumulated), and re-arming
write protection on pages that have been fetched.

Signed-off-by: Lei Cao <lei.cao@xxxxxxxxxxx>
---
 arch/x86/include/asm/kvm_host.h |   5 +
 arch/x86/kvm/mmu.c              |  93 +++++
 include/uapi/linux/kvm.h        |   4 +-
 virt/kvm/kvm_main.c             | 610 +++++++++++++++++++++++++++++-
 4 files changed, 699 insertions(+), 13 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index b7e3944..52bff2b 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1030,6 +1030,11 @@ void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
 				   gfn_t gfn_offset, unsigned long mask);
 void kvm_mmu_zap_all(struct kvm *kvm);
 void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, struct kvm_memslots *slots);
+void kvm_mmu_mt_enable_log_dirty(struct kvm *kvm);
+void kvm_mmu_mt_disable_log_dirty(struct kvm *kvm);
+int kvm_mt_mmu_reset_gfn(struct kvm *kvm, u64 slot_offset);
+gfn_t kvm_mt_slot_offset_to_gfn(struct kvm *kvm, u64 slot_offset);
+
 unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm);
 void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages);
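These entry points all work on a packed slot_offset value: the memslot id in
the upper 16 bits and the page offset within that slot in the lower 48 bits.
A standalone sketch of the translation back to a gfn that
kvm_mt_slot_offset_to_gfn() performs in mmu.c below; the toy memslot table and
all names here are illustrative only, not kernel code:

#include <stdio.h>
#include <stdint.h>

/* Toy stand-ins for KVM's memslot bookkeeping. */
struct toy_memslot {
	int id;
	uint64_t base_gfn;
	uint64_t npages;
};

static struct toy_memslot slots[] = {
	{ .id = 0, .base_gfn = 0x00000,  .npages = 0x80000 },
	{ .id = 1, .base_gfn = 0x100000, .npages = 0x40000 },
};

/* Mirror of kvm_mt_slot_offset_to_gfn(): slot id in bits 63..48, offset below. */
static uint64_t slot_offset_to_gfn(uint64_t slot_offset)
{
	int slot_id = (int)(slot_offset >> 48);
	uint64_t offset = slot_offset & 0x0000ffffffffffffUL;
	unsigned int i;

	for (i = 0; i < sizeof(slots) / sizeof(slots[0]); i++)
		if (slots[i].id == slot_id)
			return slots[i].base_gfn + offset;

	return UINT64_MAX;	/* the kernel code returns max_gfn + 1 instead */
}

int main(void)
{
	uint64_t so = ((uint64_t)1 << 48) | 0x2a;	/* slot 1, page 0x2a */

	printf("gfn = 0x%llx\n", (unsigned long long)slot_offset_to_gfn(so));
	return 0;	/* prints: gfn = 0x10002a */
}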
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 1ff4dbb..a36475a 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1443,6 +1443,58 @@ restart:
 	return 0;
 }
 
+static struct kvm_memory_slot *kvm_memslot_from_id(struct kvm *kvm, int slot_id)
+{
+	int i;
+	struct kvm_memory_slot *memslot;
+	struct kvm_memslots *slots;
+
+	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
+		slots = __kvm_memslots(kvm, i);
+		kvm_for_each_memslot(memslot, slots) {
+			if (memslot->id == slot_id)
+				return memslot;
+		}
+	}
+	return NULL;
+}
+
+gfn_t kvm_mt_slot_offset_to_gfn(struct kvm *kvm, u64 slot_offset)
+{
+	struct kvm_memory_slot *slot;
+	int slot_id;
+	gfn_t offset;
+
+	slot_id = MT_SLOT_FROM_SLOT_OFFSET(slot_offset);
+	slot = kvm_memslot_from_id(kvm, slot_id);
+	if (slot == NULL) {
+		pr_warn("KVM: bad slot_id %d\n", slot_id);
+		return kvm->mt.max_gfn+1;
+	}
+	offset = MT_OFFSET_FROM_SLOT_OFFSET(slot_offset);
+	return offset + slot->base_gfn;
+}
+
+int kvm_mt_mmu_reset_gfn(struct kvm *kvm, u64 slot_offset)
+{
+	struct kvm_memory_slot *slot;
+	int slot_id;
+	gfn_t offset, gfn;
+
+	slot_id = MT_SLOT_FROM_SLOT_OFFSET(slot_offset);
+	slot = kvm_memslot_from_id(kvm, slot_id);
+	offset = MT_OFFSET_FROM_SLOT_OFFSET(slot_offset);
+	gfn = offset + slot->base_gfn;
+
+	if (gfn > kvm->mt.max_gfn) {
+		pr_warn("KVM: bad gfn %lx\n", (long)gfn);
+		return 0;
+	}
+
+	kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, slot, offset, 1);
+	return 1;
+}
+
 struct slot_rmap_walk_iterator {
 	/* input fields. */
 	struct kvm_memory_slot *slot;
@@ -4762,6 +4814,47 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
 		kvm_flush_remote_tlbs(kvm);
 }
 
+void kvm_mmu_mt_enable_log_dirty(struct kvm *kvm)
+{
+	int i;
+	struct kvm_memslots *slots;
+	struct kvm_memory_slot *memslot;
+
+	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
+		slots = __kvm_memslots(kvm, i);
+
+		kvm_for_each_memslot(memslot, slots) {
+			if (memslot->id < KVM_USER_MEM_SLOTS) {
+				if (kvm_x86_ops->slot_enable_log_dirty)
+					kvm_x86_ops->slot_enable_log_dirty(kvm,
+								memslot);
+				else
+					kvm_mmu_slot_remove_write_access(kvm,
+								memslot);
+			}
+		}
+	}
+}
+
+void kvm_mmu_mt_disable_log_dirty(struct kvm *kvm)
+{
+	int i;
+	struct kvm_memslots *slots;
+	struct kvm_memory_slot *memslot;
+
+	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
+		slots = __kvm_memslots(kvm, i);
+
+		kvm_for_each_memslot(memslot, slots) {
+			if (memslot->id < KVM_USER_MEM_SLOTS) {
+				if (kvm_x86_ops->slot_disable_log_dirty)
+					kvm_x86_ops->slot_disable_log_dirty(kvm,
+								memslot);
+			}
+		}
+	}
+}
+
 static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
 					 struct kvm_rmap_head *rmap_head)
 {
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 2bce4db..736668d 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1344,11 +1344,11 @@ struct mt_enable {
 #define MT_OFFSET_MASK (0x0000ffffffffffffUL)
 
 #define MT_MAKE_SLOT_OFFSET(slot, offset)	\
-	do {					\
+	({					\
 		__u64 slot_off = offset & MT_OFFSET_MASK;	\
 		slot_off |= ((__u64)slot << 48);		\
 		slot_off;				\
-	} while (0)
+	})
 
 #define MT_OFFSET_FROM_SLOT_OFFSET(slot_off)	\
 	(slot_off & MT_OFFSET_MASK)
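The macro fix above matters because a do { ... } while (0) block has no value,
so "slot_offset = MT_MAKE_SLOT_OFFSET(slot_id, offset);" (as used in
mt_mark_page_dirty() below) could not compile; a GNU statement expression
evaluates to its last expression. A minimal userspace illustration, with
uint64_t standing in for __u64:

#include <stdio.h>
#include <stdint.h>

#define MT_OFFSET_MASK (0x0000ffffffffffffUL)

/*
 * As a do { ... } while (0) block this macro has no value, so assigning
 * its result would be a syntax error.  As a GNU statement expression,
 * the last expression (slot_off) becomes the value of the whole macro.
 */
#define MT_MAKE_SLOT_OFFSET(slot, offset)			\
	({							\
		uint64_t slot_off = (offset) & MT_OFFSET_MASK;	\
		slot_off |= ((uint64_t)(slot) << 48);		\
		slot_off;					\
	})

int main(void)
{
	uint64_t so = MT_MAKE_SLOT_OFFSET(3, 0x12345);

	printf("0x%llx\n", (unsigned long long)so);	/* 0x3000000012345 */
	return 0;
}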
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index fe46067..ba99cbc6 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1795,8 +1795,12 @@ int kvm_vcpu_read_guest_atomic(struct kvm_vcpu *vcpu, gpa_t gpa,
 }
 EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_atomic);
 
-static int __kvm_write_guest_page(struct kvm_memory_slot *memslot, gfn_t gfn,
-				  const void *data, int offset, int len)
+static void mt_mark_page_dirty(struct kvm *kvm, struct kvm_memory_slot *slot,
+			       gfn_t gfn, struct kvm_vcpu *vcpu);
+
+static int __kvm_write_guest_page(struct kvm *kvm,
+				  struct kvm_memory_slot *memslot, gfn_t gfn,
+				  const void *data, int offset, int len)
 {
 	int r;
 	unsigned long addr;
@@ -1808,6 +1812,8 @@ static int __kvm_write_guest_page(struct kvm_memory_slot *memslot, gfn_t gfn,
 	if (r)
 		return -EFAULT;
 	mark_page_dirty_in_slot(memslot, gfn);
+	if (memslot && (memslot->id >= 0 && memslot->id < KVM_USER_MEM_SLOTS))
+		mt_mark_page_dirty(kvm, memslot, gfn, NULL);
 	return 0;
 }
 
@@ -1816,7 +1822,7 @@ int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn,
 {
 	struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
 
-	return __kvm_write_guest_page(slot, gfn, data, offset, len);
+	return __kvm_write_guest_page(kvm, slot, gfn, data, offset, len);
 }
 EXPORT_SYMBOL_GPL(kvm_write_guest_page);
 
@@ -1825,7 +1831,7 @@ int kvm_vcpu_write_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
 {
 	struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
 
-	return __kvm_write_guest_page(slot, gfn, data, offset, len);
+	return __kvm_write_guest_page(vcpu->kvm, slot, gfn, data, offset, len);
 }
 EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest_page);
 
@@ -1929,6 +1935,10 @@ int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
 	if (r)
 		return -EFAULT;
 	mark_page_dirty_in_slot(ghc->memslot, ghc->gpa >> PAGE_SHIFT);
+	if (ghc->memslot && (ghc->memslot->id >= 0 &&
+	    ghc->memslot->id < KVM_USER_MEM_SLOTS))
+		mt_mark_page_dirty(kvm, ghc->memslot, ghc->gpa >> PAGE_SHIFT,
+				   NULL);
 
 	return 0;
 }
@@ -1996,11 +2006,95 @@ static void mark_page_dirty_in_slot(struct kvm_memory_slot *memslot,
 	}
 }
 
+/*
+ * We have some new dirty pages for our sublist waiter.  Enough to merit
+ * waking it up?
+ */
+static void mt_sw_add_pages(struct kvm *kvm)
+{
+	int avail = kvm->mt.tot_pages - kvm->mt.fetch_count;
+	struct sublist_waiter *swp = &kvm->mt.sw;
+
+	spin_lock(&kvm->mt.sw_lock);
+
+	if (swp->goal && (avail >= swp->goal)) {
+		kvm->mt.fetch_count += avail;
+		swp->goal = 0;
+		wake_up(&swp->wq);
+	}
+
+	spin_unlock(&kvm->mt.sw_lock);
+}
+
+#define DIRTY_GFN_ADD_GRANULARITY	(256)
+
+static void mt_mark_page_dirty(struct kvm *kvm, struct kvm_memory_slot *slot,
+			       gfn_t gfn, struct kvm_vcpu *vcpu)
+{
+	int use_kvm;	/* add to global list? */
+	struct gfn_list *gfnlist;
+	int slot_id = slot->id;
+	__u64 offset = gfn - slot->base_gfn;
+	__u64 slot_offset;
+
+	/*
+	 * Try to add dirty page to vcpu list.  If vcpu is NULL or
+	 * vcpu list is full, then try to add to kvm master list.
+	 */
+
+	if (!kvm->mt.active)
+		return;
+
+	if (slot->id >= KVM_USER_MEM_SLOTS)
+		return;
+
+	if (gfn > kvm->mt.max_gfn)
+		return;
+
+	/* if we're avoiding duplicates, is this one already marked? */
+	if (kvm->mt.bmap && test_and_set_bit(gfn, kvm->mt.bmap))
+		return;
+
+	slot_offset = MT_MAKE_SLOT_OFFSET(slot_id, offset);
+
+	use_kvm = (vcpu == NULL);
+
+	if (vcpu) {
+		gfnlist = &vcpu->kvm->vcpu_mt[vcpu->vcpu_id].gfn_list;
+		if (gfnlist->dirty_index == gfnlist->max_dirty) {
+			use_kvm = 1;
+			gfnlist->overflow = 1;
+			/* Fall back to master gfn list. */
+			gfnlist = &kvm->mt.gfn_list;
+		}
+	} else {
+		gfnlist = &kvm->mt.gfn_list;
+	}
+
+	spin_lock(&gfnlist->lock);
+	if (gfnlist->dirty_index >= gfnlist->max_dirty) {
+		gfnlist->overflow = 1;
+	} else {
+		gfnlist->dirty_gfns[gfnlist->dirty_index++] = slot_offset;
+		if ((gfnlist->dirty_index % DIRTY_GFN_ADD_GRANULARITY) == 0) {
+			spin_lock(&kvm->mt.lock);
+			kvm->mt.tot_pages += DIRTY_GFN_ADD_GRANULARITY;
+			mt_sw_add_pages(kvm);
+			spin_unlock(&kvm->mt.lock);
+		}
+	}
+	spin_unlock(&gfnlist->lock);
+}
+
 void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
 {
 	struct kvm_memory_slot *memslot;
 
 	memslot = gfn_to_memslot(kvm, gfn);
+	if (memslot) {
+		if (memslot->id >= 0 && memslot->id < KVM_USER_MEM_SLOTS)
+			mt_mark_page_dirty(kvm, memslot, gfn, NULL);
+	}
 	mark_page_dirty_in_slot(memslot, gfn);
 }
 EXPORT_SYMBOL_GPL(mark_page_dirty);
@@ -2010,6 +2104,10 @@ void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn)
 	struct kvm_memory_slot *memslot;
 
 	memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
+	if (memslot) {
+		if (memslot->id >= 0 && memslot->id < KVM_USER_MEM_SLOTS)
+			mt_mark_page_dirty(vcpu->kvm, memslot, gfn, vcpu);
+	}
 	mark_page_dirty_in_slot(memslot, gfn);
 }
 EXPORT_SYMBOL_GPL(kvm_vcpu_mark_page_dirty);
@@ -2823,8 +2921,6 @@ static u64 kvm_get_max_gfn(struct kvm *kvm)
 	return num_gfn - 1;
 }
 
-#define DIRTY_GFN_ADD_GRANULARITY	(256)
-
 /*
  * Return a the smallest multiple of DIRTY_GFN_ADD_GRANULARITY that is >= goal.
 */
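A compressed userspace model of the enqueue policy in mt_mark_page_dirty():
prefer the calling vcpu's private list, fall back to the shared master list
when the vcpu list is full, and use a bitmap so each page is only queued once
per pass. All structures below are illustrative stand-ins, not the kernel's:

#include <stdio.h>
#include <stdint.h>

#define VCPU_LIST_MAX	4	/* tiny on purpose, to show the fallback */
#define MASTER_LIST_MAX	16
#define MAX_GFN		1024

static uint64_t vcpu_list[VCPU_LIST_MAX];
static int vcpu_dirty_index;
static uint64_t master_list[MASTER_LIST_MAX];
static int master_dirty_index;
static unsigned char bmap[MAX_GFN / 8];	/* dedup bitmap, like kvm->mt.bmap */

static int test_and_set(uint64_t gfn)
{
	int was = bmap[gfn / 8] & (1 << (gfn % 8));

	bmap[gfn / 8] |= 1 << (gfn % 8);
	return was != 0;
}

static void mark_page_dirty_model(uint64_t gfn, int have_vcpu)
{
	if (test_and_set(gfn))
		return;			/* already recorded this pass */

	if (have_vcpu && vcpu_dirty_index < VCPU_LIST_MAX) {
		vcpu_list[vcpu_dirty_index++] = gfn;
		return;
	}
	/* vcpu list full (or no vcpu context): fall back to the master list */
	if (master_dirty_index < MASTER_LIST_MAX)
		master_list[master_dirty_index++] = gfn;
	/* else: the overflow flag would be set and the pass restarted */
}

int main(void)
{
	uint64_t gfn;

	for (gfn = 0; gfn < 6; gfn++)
		mark_page_dirty_model(gfn, 1);
	mark_page_dirty_model(2, 1);	/* duplicate, dropped by the bitmap */

	printf("vcpu list: %d entries, master list: %d entries\n",
	       vcpu_dirty_index, master_dirty_index);
	return 0;	/* prints: vcpu list: 4 entries, master list: 2 entries */
}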
@@ -3010,31 +3106,523 @@ static int kvm_vm_ioctl_mt_init(struct kvm *kvm, struct mt_setup *mts)
 	return -EINVAL;
 }
 
+static int kvm_enable_mt(struct kvm *kvm)
+{
+	int rc = 0;
+
+	if (kvm->mt.active) {
+		pr_warn("KVM: vm %d, MT already active\n",
+			current->pid);
+		rc = -EINVAL;
+		goto enable_mt_done;
+	}
+
+	kvm_mmu_mt_enable_log_dirty(kvm);
+	if (kvm->mt.bmap)
+		memset(kvm->mt.bmap, 0, kvm->mt.bmapsz);
+
+	kvm->mt.active = 1;
+
+enable_mt_done:
+
+	return rc;
+}
+
+static int kvm_disable_mt(struct kvm *kvm)
+{
+	int rc = 0;
+
+	if (!kvm->mt.active) {
+		pr_warn("KVM: vm %d, MT already disabled\n",
+			current->pid);
+		rc = -EINVAL;
+		goto disable_mt_done;
+	}
+
+	kvm_mmu_mt_disable_log_dirty(kvm);
+	kvm->mt.active = 0;
+
+disable_mt_done:
+
+	return rc;
+}
+
 static int kvm_vm_ioctl_mt_enable(struct kvm *kvm, struct mt_enable *mte)
 {
-	return -EINVAL;
+	if ((mte->flags & 0x1) == 1)
+		return kvm_enable_mt(kvm);
+	else if ((mte->flags & 0x1) == 0)
+		return kvm_disable_mt(kvm);
+	else
+		return -EINVAL;
 }
 
 static int kvm_vm_ioctl_mt_prepare_cp(struct kvm *kvm,
 				      struct mt_prepare_cp *mtpcp)
 {
-	return -EINVAL;
+	int i;
+	struct kvm_vcpu *vcpu;
+	struct gfn_list *gfnlist;
+
+	if (!kvm->mt.active)
+		return -EINVAL;
+
+	kvm->mt.cp_id = mtpcp->cpid;
+
+	kvm_for_each_vcpu(i, vcpu, kvm) {
+		gfnlist = &vcpu->kvm->vcpu_mt[vcpu->vcpu_id].gfn_list;
+		spin_lock(&gfnlist->lock);
+		gfnlist->fetch_index = 0;
+		gfnlist->reset_index = 0;
+		gfnlist->dirty_index = 0;
+		gfnlist->overflow = 0;
+		spin_unlock(&gfnlist->lock);
+	}
+
+	gfnlist = &kvm->mt.gfn_list;
+	spin_lock(&gfnlist->lock);
+	gfnlist->fetch_index = 0;
+	gfnlist->reset_index = 0;
+	gfnlist->dirty_index = 0;
+	gfnlist->overflow = 0;
+	spin_unlock(&gfnlist->lock);
+
+	kvm->mt.quiesced = 0;
+	kvm->mt.allow_blocking = 1;
+	kvm->mt.tot_pages = kvm->mt.fetch_count = 0;
+
+	return 0;
+}
+
+static bool mt_reset_gfn(struct kvm *kvm, u64 slot_offset)
+{
+	gfn_t gfn;
+
+	gfn = kvm_mt_slot_offset_to_gfn(kvm, slot_offset);
+	if (gfn > kvm->mt.max_gfn)
+		return 0;
+
+	if (kvm->mt.bmap) {
+		if (kvm->mt.quiesced) {
+			/*
+			 * Goal is to reset entire bmap, but don't need
+			 * atomics if we are quiesced
+			 */
+			int offset32 = gfn/32;
+			int *p = (int *)(kvm->mt.bmap) + offset32;
+			*p = 0;
+		} else {
+			clear_bit(gfn, kvm->mt.bmap);
+		}
+	}
+
+	return kvm_mt_mmu_reset_gfn(kvm, slot_offset);
+}
+
+#define GFN_RESET_BATCH	(64)
+
+static int mt_reset_all_gfns(struct kvm *kvm)
+{
+	int i, j;
+	struct kvm_vcpu *vcpu;
+	struct gfn_list *gfnlist;
+	bool cleared = false;
+	int reset_start, count, avail;
+
+	if (!kvm->mt.active)
+		return -EINVAL;
+
+	if (!kvm->mt.quiesced)
+		return -EINVAL;
+
+	spin_lock(&kvm->mmu_lock);
+
+	kvm_for_each_vcpu(i, vcpu, kvm) {
+		gfnlist = &vcpu->kvm->vcpu_mt[vcpu->vcpu_id].gfn_list;
+
+vcpu_gfn_loop:
+
+		spin_lock(&gfnlist->lock);
+		reset_start = gfnlist->reset_index;
+		avail = gfnlist->dirty_index - gfnlist->reset_index;
+		count = avail > GFN_RESET_BATCH ? GFN_RESET_BATCH : avail;
+		gfnlist->reset_index += count;
+		spin_unlock(&gfnlist->lock);
+
+		for (j = reset_start; j < reset_start + count; j++)
+			cleared |= mt_reset_gfn(kvm, gfnlist->dirty_gfns[j]);
+
+		if (count)
+			goto vcpu_gfn_loop;
+	}
+
+	gfnlist = &kvm->mt.gfn_list;
+
+global_gfn_loop:
+
+	spin_lock(&gfnlist->lock);
+	reset_start = gfnlist->reset_index;
+	avail = gfnlist->dirty_index - gfnlist->reset_index;
+	count = avail > GFN_RESET_BATCH ? GFN_RESET_BATCH : avail;
+	gfnlist->reset_index += count;
+	spin_unlock(&gfnlist->lock);
+
+	for (j = reset_start; j < reset_start + count; j++)
+		cleared |= mt_reset_gfn(kvm, gfnlist->dirty_gfns[j]);
+
+	if (count)
+		goto global_gfn_loop;
+
+	spin_unlock(&kvm->mmu_lock);
+
+	if (cleared)
+		kvm_flush_remote_tlbs(kvm);
+
+	return 0;
 }
 
 static int kvm_vm_ioctl_mt_rearm_gfns(struct kvm *kvm)
 {
-	return -EINVAL;
+	return mt_reset_all_gfns(kvm);
+}
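For context, userspace is expected to drive one tracking cycle roughly as in
the sketch below. The KVM_MT_* ioctl names, command numbers and struct
layouts are assumptions based on the uapi additions earlier in this series
(only the field names flags and cpid appear in this hunk); error handling is
omitted:

#include <stdint.h>
#include <sys/ioctl.h>

/*
 * Placeholder ioctl definitions: the real ones come from <linux/kvm.h>
 * as extended earlier in this series.  0xAE is KVMIO; the command
 * numbers below are made up for illustration.
 */
struct mt_enable     { uint32_t flags; };
struct mt_prepare_cp { uint64_t cpid; };
#define KVM_MT_ENABLE     _IOW(0xAE, 0xe0, struct mt_enable)      /* assumed */
#define KVM_MT_PREPARE_CP _IOW(0xAE, 0xe1, struct mt_prepare_cp)  /* assumed */
#define KVM_MT_REARM_GFNS _IO(0xAE, 0xe2)                         /* assumed */

/* One checkpoint epoch, minus the dirty-page fetch step (shown later). */
int run_one_epoch(int vm_fd, uint64_t cpid)
{
	struct mt_enable en = { .flags = 1 };	/* bit 0 set: enable tracking */
	struct mt_prepare_cp cp = { .cpid = cpid };

	if (ioctl(vm_fd, KVM_MT_ENABLE, &en))		/* write-protect guest */
		return -1;
	if (ioctl(vm_fd, KVM_MT_PREPARE_CP, &cp))	/* zero all gfn lists */
		return -1;

	/* ... run the guest, fetch dirty sublists, quiesce, copy state ... */

	return ioctl(vm_fd, KVM_MT_REARM_GFNS, 0);	/* re-arm fetched gfns */
}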
+
+static int mt_unblock_sw(struct kvm *kvm)
+{
+	struct sublist_waiter *swp;
+
+	if (!kvm->mt.active)
+		return -EINVAL;
+
+	spin_lock(&kvm->mt.sw_lock);
+
+	kvm->mt.allow_blocking = 0;
+
+	/* Make sure allow_blocking is clear before the wake up */
+	mb();
+
+	swp = &kvm->mt.sw;
+	wake_up(&swp->wq);
+
+	spin_unlock(&kvm->mt.sw_lock);
+
+	return 0;
 }
 
 static int kvm_vm_ioctl_mt_quiesced(struct kvm *kvm)
 {
-	return -EINVAL;
+	if (!kvm->mt.active)
+		return -EINVAL;
+
+	kvm->mt.quiesced = 1;
+
+	/* wake up the sublist waiter */
+	mt_unblock_sw(kvm);
+
+	if (kvm->mt.gfn_list.overflow)
+		return -ENOMEM;
+
+	return 0;
+}
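The sublist waiter amounts to a one-slot "wake me when N more pages are dirty"
rendezvous: producers wake it from mt_sw_add_pages() once the goal is met, and
quiescing wakes it unconditionally by clearing allow_blocking. A userspace
analogue using a mutex and condition variable, purely illustrative:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  wq   = PTHREAD_COND_INITIALIZER;
static int goal;		/* like sw.goal: 0 means "satisfied" */
static int allow_blocking = 1;	/* cleared by "quiesce" */
static int tot_pages;

static void add_pages(int n)	/* producer side, cf. mt_sw_add_pages() */
{
	pthread_mutex_lock(&lock);
	tot_pages += n;
	if (goal && tot_pages >= goal) {
		goal = 0;
		pthread_cond_signal(&wq);
	}
	pthread_mutex_unlock(&lock);
}

static void quiesce(void)	/* cf. mt_unblock_sw() */
{
	pthread_mutex_lock(&lock);
	allow_blocking = 0;
	pthread_cond_signal(&wq);
	pthread_mutex_unlock(&lock);
}

static void wait_for(int n)	/* waiter side, cf. mt_sublist_req_wait() */
{
	pthread_mutex_lock(&lock);
	if (tot_pages < n) {	/* kernel code first tries a nowait fetch */
		goal = n;
		while (allow_blocking && goal)
			pthread_cond_wait(&wq, &lock);
	}
	pthread_mutex_unlock(&lock);
}

static void *producer(void *arg)
{
	(void)arg;
	add_pages(256);		/* one DIRTY_GFN_ADD_GRANULARITY batch */
	add_pages(256);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, producer, NULL);
	wait_for(300);		/* returns once enough pages have been added */
	pthread_join(t, NULL);
	printf("woken with %d pages accumulated\n", tot_pages);
	quiesce();
	return 0;
}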
+
+static int mt_sublist_req_nowait(struct kvm *kvm,
+				 struct mt_sublist_fetch_info *msfi, int offset)
+{
+	int i, j, avail, goal = msfi->gfn_info.count;
+	struct kvm_vcpu *vcpu;
+	__u64 *gfndst, *gfnsrc;
+	int rc = 0;
+	__u64 slot_offset;
+	int index;
+
+	/* Clearing dirty/write bits requires tlb flush before exit */
+	int cleared = 0;
+
+	/* Don't need to lock gfn lists if we're in VM blackout */
+	int need_locks = !kvm->mt.quiesced;
+
+	/* Consolidate flags */
+	int reset = msfi->flags & MT_FETCH_REARM;
+	int bmap = kvm->mt.bmap != NULL;
+
+	if (goal == 0)
+		return 0;
+
+	gfndst = &msfi->gfn_info.gfnlist[offset];
+	msfi->gfn_info.count = offset;
+
+	kvm_for_each_vcpu(i, vcpu, kvm) {
+		int len, rem;
+		int vcpu_id;
+		struct gfn_list *gfnlist;
+
+		vcpu_id = vcpu->vcpu_id;
+		gfnlist = &vcpu->kvm->vcpu_mt[vcpu_id].gfn_list;
+
+		mutex_lock(&gfnlist->mtx);
+		if (need_locks)
+			spin_lock(&gfnlist->lock);
+
+		avail = gfnlist->dirty_index - gfnlist->fetch_index;
+		if (!avail) {
+			if (need_locks)
+				spin_unlock(&gfnlist->lock);
+			mutex_unlock(&gfnlist->mtx);
+			continue;
+		}
+		avail = avail > goal ? goal : avail;
+		for (j = 0; j < avail; j++) {
+			index = gfnlist->fetch_index+j;
+			slot_offset = gfnlist->dirty_gfns[index];
+			kvm->mt.gfn_buf[j] = kvm_mt_slot_offset_to_gfn(kvm,
+								slot_offset);
+		}
+		gfnsrc = &kvm->mt.gfn_buf[0];
+
+		if (need_locks)
+			spin_unlock(&gfnlist->lock);
+
+		rem = copy_to_user(gfndst, gfnsrc,
+				   avail*sizeof(*gfndst)) / sizeof(*gfndst);
+
+		/*
+		 * Need mmu_lock if we're going to do kvm_mt_mmu_reset_gfn
+		 * below, but must take mmu_lock _before_ gfnlist lock.
+		 */
+		if (reset)
+			spin_lock(&kvm->mmu_lock);
+
+		if (need_locks)
+			spin_lock(&gfnlist->lock);
+
+		len = avail - rem;
+		msfi->gfn_info.count += len;
+		gfndst += len;
+		if (reset) {
+			__u64 gfn;
+
+			for (j = 0; j < len; j++) {
+				index = gfnlist->fetch_index+j;
+				slot_offset = gfnlist->dirty_gfns[index];
+				gfn = kvm_mt_slot_offset_to_gfn(kvm,
+								slot_offset);
+				cleared +=
+					kvm_mt_mmu_reset_gfn(kvm, slot_offset);
+				if (bmap)
+					clear_bit(gfn, kvm->mt.bmap);
+			}
+			gfnlist->reset_index += len;
+		}
+		gfnlist->fetch_index += len;
+
+		if (need_locks)
+			spin_unlock(&gfnlist->lock);
+		if (reset)
+			spin_unlock(&kvm->mmu_lock);
+		mutex_unlock(&gfnlist->mtx);
+
+		if (len != avail) {
+			rc = -EFAULT;
+			goto copy_done_err;
+		}
+
+		goal -= avail;
+		if (goal == 0)
+			break;
+	}
+
+	/* If we still need more gfns, consult the master list */
+	if (goal) {
+		int len, rem;
+		struct gfn_list *gfnlist = &kvm->mt.gfn_list;
+
+		mutex_lock(&gfnlist->mtx);
+		if (need_locks)
+			spin_lock(&gfnlist->lock);
+
+		avail = gfnlist->dirty_index - gfnlist->fetch_index;
+		if (!avail) {
+			if (need_locks)
+				spin_unlock(&gfnlist->lock);
+			mutex_unlock(&gfnlist->mtx);
+			goto copy_done_no_err;
+		}
+		avail = avail > goal ? goal : avail;
+		for (j = 0; j < avail; j++) {
+			index = gfnlist->fetch_index+j;
+			slot_offset = gfnlist->dirty_gfns[index];
+			kvm->mt.gfn_buf[j] = kvm_mt_slot_offset_to_gfn(kvm,
+								slot_offset);
+		}
+		gfnsrc = &kvm->mt.gfn_buf[0];
+
+		if (need_locks)
+			spin_unlock(&gfnlist->lock);
+
+		rem = copy_to_user(gfndst, gfnsrc,
+				   avail*sizeof(*gfndst)) / sizeof(*gfndst);
+
+		/*
+		 * Need mmu_lock if we're going to do kvm_mt_mmu_reset_gfn
+		 * below, but must take mmu_lock _before_ gfnlist lock.
+		 */
+		if (reset)
+			spin_lock(&kvm->mmu_lock);
+
+		if (need_locks)
+			spin_lock(&gfnlist->lock);
+
+		len = avail - rem;
+		msfi->gfn_info.count += len;
+		if (reset) {
+			__u64 slot_offset;
+			__u64 gfn;
+
+			for (j = 0; j < len; j++) {
+				index = gfnlist->fetch_index+j;
+				slot_offset = gfnlist->dirty_gfns[index];
+				gfn = kvm_mt_slot_offset_to_gfn(kvm,
+								slot_offset);
+				cleared +=
+					kvm_mt_mmu_reset_gfn(kvm, slot_offset);
+				if (bmap)
+					clear_bit(gfn, kvm->mt.bmap);
+			}
+			gfnlist->reset_index += len;
+		}
+		gfnlist->fetch_index += len;
+
+		if (need_locks)
+			spin_unlock(&gfnlist->lock);
+		if (reset)
+			spin_unlock(&kvm->mmu_lock);
+		mutex_unlock(&gfnlist->mtx);
+
+		if (len != avail) {
+			rc = -EFAULT;
+			goto copy_done_err;
+		}
+
+		goal -= avail;
+	}
+
+copy_done_no_err:
+
+copy_done_err:
+
+	if (cleared)
+		kvm_flush_remote_tlbs(kvm);
+
+	return rc;
+}
+
+static int mt_sublist_req_wait(struct kvm *kvm,
+			       struct mt_sublist_fetch_info *msfi)
+{
+	struct sublist_waiter *swp;
+	int goal = msfi->gfn_info.count;
+	int offset;
+	int rc;
+
+	if (msfi->gfn_info.count == 0)
+		return 0;
+
+	spin_lock(&kvm->mt.sw_lock);
+	if (!kvm->mt.allow_blocking) {
+		spin_unlock(&kvm->mt.sw_lock);
+		return -EINVAL;
+	}
+	spin_unlock(&kvm->mt.sw_lock);
+
+	rc = mt_sublist_req_nowait(kvm, msfi, 0);
+	if (rc || (msfi->gfn_info.count == goal))
+		return rc;
+
+	offset = msfi->gfn_info.count;
+
+	spin_lock(&kvm->mt.sw_lock);
+
+	if (kvm->mt.sw_busy) {
+		spin_unlock(&kvm->mt.sw_lock);
+		return -EBUSY;
+	}
+	kvm->mt.sw_busy = 1;
+
+	swp = &kvm->mt.sw;
+	swp->goal = goal;
+
+	spin_unlock(&kvm->mt.sw_lock);
+
+	rc = wait_event_interruptible(swp->wq,
+				      !kvm->mt.allow_blocking || !swp->goal);
+
+	spin_lock(&kvm->mt.sw_lock);
+
+	kvm->mt.sw_busy = 0;
+
+	spin_unlock(&kvm->mt.sw_lock);
+
+	if (rc)
+		return rc;
+
+	msfi->gfn_info.count = goal - offset;
+
+	return mt_sublist_req_nowait(kvm, msfi, offset);
+}
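From userspace, gfn_info.count is both input (buffer capacity) and output
(gfns actually returned); MT_FETCH_WAIT turns a short result into a blocking
wait, and MT_FETCH_REARM re-write-protects pages as they are fetched. A hedged
sketch of a fetch call; the ioctl number, flag values, struct names and field
types are assumed from earlier patches in the series (only the field names
used in this hunk are known):

#include <stdint.h>
#include <stdlib.h>
#include <sys/ioctl.h>

/* Assumed uapi shapes; the real definitions come from earlier in the series. */
struct mt_gfn_list {
	uint64_t count;		/* in: capacity, out: gfns returned */
	uint64_t *gfnlist;	/* user buffer, NULL means "just count" */
};
struct mt_sublist_fetch_info {
	struct mt_gfn_list gfn_info;
	uint32_t flags;
};
#define MT_FETCH_WAIT	0x1	/* assumed flag values */
#define MT_FETCH_REARM	0x2
#define KVM_MT_SUBLIST_FETCH _IOWR(0xAE, 0xe3, struct mt_sublist_fetch_info) /* assumed */

/* Pull up to 'cap' dirty gfns, re-arming them as they are fetched. */
long fetch_dirty(int vm_fd, uint64_t *buf, uint64_t cap, int block)
{
	struct mt_sublist_fetch_info f = {
		.gfn_info = { .count = cap, .gfnlist = buf },
		.flags = MT_FETCH_REARM | (block ? MT_FETCH_WAIT : 0),
	};

	if (ioctl(vm_fd, KVM_MT_SUBLIST_FETCH, &f))
		return -1;
	return (long)f.gfn_info.count;	/* may be less than cap */
}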
+
+static int mt_get_dirty_count(struct kvm *kvm,
+			      struct mt_sublist_fetch_info *msfi)
+{
+	int i, avail = 0;
+	struct kvm_vcpu *vcpu;
+	struct gfn_list *gfnlist;
+
+	/* Don't need to lock gfn lists if we're in VM blackout */
+	int need_locks = !kvm->mt.quiesced;
+
+	kvm_for_each_vcpu(i, vcpu, kvm) {
+		gfnlist = &vcpu->kvm->vcpu_mt[vcpu->vcpu_id].gfn_list;
+
+		mutex_lock(&gfnlist->mtx);
+		if (need_locks)
+			spin_lock(&gfnlist->lock);
+		avail += gfnlist->dirty_index - gfnlist->fetch_index;
+		if (need_locks)
+			spin_unlock(&gfnlist->lock);
+		mutex_unlock(&gfnlist->mtx);
+	}
+
+	gfnlist = &kvm->mt.gfn_list;
+
+	mutex_lock(&gfnlist->mtx);
+	if (need_locks)
+		spin_lock(&gfnlist->lock);
+	avail += gfnlist->dirty_index - gfnlist->fetch_index;
+	if (need_locks)
+		spin_unlock(&gfnlist->lock);
+	mutex_unlock(&gfnlist->mtx);
+
+	msfi->gfn_info.count = avail;
+
+	return 0;
 }
 
 static int kvm_vm_ioctl_mt_sublist_fetch(struct kvm *kvm,
 					 struct mt_sublist_fetch_info *mtsfi)
 {
-	return -EINVAL;
+	if (!kvm->mt.active)
+		return -EINVAL;
+
+	if (mtsfi->gfn_info.gfnlist == NULL)
+		return mt_get_dirty_count(kvm, mtsfi);
+
+	if (mtsfi->gfn_info.count == 0)
+		return 0;
+
+	if (!(mtsfi->flags & MT_FETCH_WAIT))
+		return mt_sublist_req_nowait(kvm, mtsfi, 0);
+
+	return mt_sublist_req_wait(kvm, mtsfi);
 }
 
 static int kvm_vm_ioctl_mt_dirty_trigger(struct kvm *kvm, int dirty_trigger)
-- 
2.5.0
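Since kvm_vm_ioctl_mt_sublist_fetch() treats a NULL gfnlist pointer as "just
report how many gfns are pending", userspace can size its buffer before
fetching. Continuing the hedged sketch above (same assumed ioctl number,
struct definitions and fetch_dirty() helper):

/* Ask how many dirty gfns are pending, then fetch exactly that many. */
long drain_dirty(int vm_fd, uint64_t **out)
{
	struct mt_sublist_fetch_info q = {
		.gfn_info = { .count = 0, .gfnlist = NULL },	/* count only */
	};
	uint64_t *buf;
	long n;

	if (ioctl(vm_fd, KVM_MT_SUBLIST_FETCH, &q))
		return -1;
	if (q.gfn_info.count == 0) {
		*out = NULL;
		return 0;
	}

	buf = calloc(q.gfn_info.count, sizeof(*buf));
	if (!buf)
		return -1;

	n = fetch_dirty(vm_fd, buf, q.gfn_info.count, 0 /* no wait */);
	if (n < 0) {
		free(buf);
		return -1;
	}
	*out = buf;
	return n;
}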